# Anomaly: Gross Profit to asset ratio 

in this part, we will investigate the anomaly of Gross Profit to asset ratio. The reference of this part is [Novy-Marx, Robert, “The Other Side of Value: The Gross Profitability Premium,”](papers/❀The%20other%20side%20of%20value：The%20gross%20profitability%20premium.pdf) 

# 1. construct ratio factor

In [315]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from finance_byu import fama_macbeth
from numpy_ext import rolling_apply
from dateutil.relativedelta import relativedelta

In [42]:
# Load the data
data = pd.read_excel('source_data/利润表/FS_Comins.xlsx')

  warn("Workbook contains no default style, apply openpyxl's default")


In [43]:
data.head(5)

Unnamed: 0,Stkcd,ShortName,Accper,Typrep,B001100000,B001200000
0,证券代码,证券简称,统计截止日期,报表类型,营业总收入,营业总成本
1,没有单位,没有单位,没有单位,没有单位,元,元
2,000001,深发展A,2000-01-01,A,1079657853,1070282071
3,000001,深发展A,2000-06-30,A,563942659,524513922
4,000001,深发展A,2000-12-31,A,1431286264,1259262034


In [46]:
data = data.iloc[2:, :]
data['B001100000'] = data['B001100000'].astype(float)
data['B001200000'] = data['B001200000'].astype(float)
data['gross profit'] = data['B001100000'] - data['B001200000']
data.head(5)

Unnamed: 0,Stkcd,ShortName,Accper,Typrep,B001100000,B001200000,gross profit
2,1,深发展A,2000-01-01,A,1079658000.0,1070282000.0,9375782.0
3,1,深发展A,2000-06-30,A,563942700.0,524513900.0,39428737.0
4,1,深发展A,2000-12-31,A,1431286000.0,1259262000.0,172024230.0
5,1,深发展A,2001-01-01,A,1431286000.0,1302838000.0,128448010.0
6,1,深发展A,2001-06-30,A,836514200.0,768024100.0,68490081.0


In [47]:
data = data[data['Typrep'] == "A"]
data = data[["Stkcd", "Accper", "gross profit"]]
data.head()

Unnamed: 0,Stkcd,Accper,gross profit
2,1,2000-01-01,9375782.0
3,1,2000-06-30,39428737.0
4,1,2000-12-31,172024230.0
5,1,2001-01-01,128448010.0
6,1,2001-06-30,68490081.0


In [48]:
data.columns = ['Stkcd', 'date', 'gross_profit']

In [49]:
data.head()

Unnamed: 0,Stkcd,date,gross_profit
2,1,2000-01-01,9375782.0
3,1,2000-06-30,39428737.0
4,1,2000-12-31,172024230.0
5,1,2001-01-01,128448010.0
6,1,2001-06-30,68490081.0


In [50]:
# this procedure is the same in part1
balance_sheet = pd.read_csv(r'source_data/Balance Sheet/balance_sheet.csv')  # load the balance sheet data
balance_sheet = balance_sheet[balance_sheet['Statement Type'] == "A"]
balance_sheet = balance_sheet[['Stock Code', 'Ending Date of Statistics', 'Total Assets', 'Total Shareholders’ Equity']]
balance_sheet.columns = ['Stkcd', 'date', 'total_assets', 'total_shareholders_equity']
balance_sheet = balance_sheet.iloc[1:, :]
balance_sheet['date'] = pd.to_datetime(balance_sheet['date'])
balance_sheet['month'] = balance_sheet['date'].dt.to_period('M')
balance_sheet = balance_sheet[['Stkcd', 'month', 'total_assets', 'total_shareholders_equity']]
balance_sheet

  balance_sheet = pd.read_csv(r'source_data/Balance Sheet/balance_sheet.csv')  # load the balance sheet data


Unnamed: 0,Stkcd,month,total_assets,total_shareholders_equity
2,000001,2000-06,49732336516,3078512556
3,000001,2000-12,67227499769,4738883655
4,000001,2001-01,66006167607,3517551493
5,000001,2001-06,85181426762,4961824149
6,000001,2001-12,120126983351,3627668792
...,...,...,...,...
583378,900957,2023-01,1066446596.43,566925821.84
583380,900957,2023-03,1070059261.75,575645952.6
583382,900957,2023-06,1067733802.62,586222511.87
583384,900957,2023-09,1060743496.42,598366406.44


In [52]:
# merge data and balance sheet
data['month'] = pd.to_datetime(data['date']).dt.to_period('M')
data = data.merge(balance_sheet, on=['Stkcd', 'month'], how='left')
data

Unnamed: 0,Stkcd,date,gross_profit,month,total_assets,total_shareholders_equity
0,000001,2000-01-01,9.375782e+06,2000-01,,
1,000001,2000-06-30,3.942874e+07,2000-06,49732336516,3078512556
2,000001,2000-12-31,1.720242e+08,2000-12,67227499769,4738883655
3,000001,2001-01-01,1.284480e+08,2001-01,66006167607,3517551493
4,000001,2001-06-30,6.849008e+07,2001-06,85181426762,4961824149
...,...,...,...,...,...,...
314441,900957,2023-01-01,3.559758e+07,2023-01,,
314442,900957,2023-03-31,9.810177e+06,2023-03,,
314443,900957,2023-06-30,2.004156e+07,2023-06,,
314444,900957,2023-09-30,3.071038e+07,2023-09,,


In [54]:
data['total_assets'] = data['total_assets'].astype(float)
data['gross_profit_to_asset'] = data['gross_profit'] / data['total_assets']
data.head()

Unnamed: 0,Stkcd,date,gross_profit,month,total_assets,total_shareholders_equity,gross_profit_to_asset
0,1,2000-01-01,9375782.0,2000-01,,,
1,1,2000-06-30,39428737.0,2000-06,49732340000.0,3078512556.0,0.000793
2,1,2000-12-31,172024230.0,2000-12,67227500000.0,4738883655.0,0.002559
3,1,2001-01-01,128448010.0,2001-01,66006170000.0,3517551493.0,0.001946
4,1,2001-06-30,68490081.0,2001-06,85181430000.0,4961824149.0,0.000804


In [55]:
monthly_stock_return = pd.read_feather('temp/monthly_stock_return.feather')
monthly_stock_return

Unnamed: 0,Stkcd,month,market_value,monthly_stock_return
2,000001,2000-01,28755726.61,0.061891
3,000001,2000-02,28429838.73,-0.011333
4,000001,2000-03,28507431.08,0.002729
5,000001,2000-04,29562687.10,0.037017
6,000001,2000-05,27933247.66,-0.055118
...,...,...,...,...
730496,900957,2023-09,139949.00,-0.067446
730497,900957,2023-10,139251.00,-0.004992
730498,900957,2023-11,144486.00,0.037596
730499,900957,2023-12,144137.00,-0.002418


In [56]:
monthly_stock_return.dtypes

Stkcd                      object
month                   period[M]
market_value              float64
monthly_stock_return      float64
dtype: object

In [59]:
monthly_stock_return = monthly_stock_return.merge(data[['Stkcd', 'month', 'gross_profit_to_asset']],
                                                  on=['Stkcd', 'month'], how='left').bfill()
monthly_stock_return

Unnamed: 0,Stkcd,month,market_value,monthly_stock_return,gross_profit_to_asset
0,000001,2000-01,28755726.61,0.061891,0.000793
1,000001,2000-02,28429838.73,-0.011333,0.000793
2,000001,2000-03,28507431.08,0.002729,0.000793
3,000001,2000-04,29562687.10,0.037017,0.000793
4,000001,2000-05,27933247.66,-0.055118,0.000793
...,...,...,...,...,...
730494,900957,2023-09,139949.00,-0.067446,
730495,900957,2023-10,139251.00,-0.004992,
730496,900957,2023-11,144486.00,0.037596,
730497,900957,2023-12,144137.00,-0.002418,


In [61]:
monthly_stock_return = monthly_stock_return.dropna()
monthly_stock_return['GP/Assets_tag'] = monthly_stock_return.groupby(['month'])['gross_profit_to_asset'].transform(
    lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))
monthly_stock_return

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  monthly_stock_return['GP/Assets_tag'] = monthly_stock_return.groupby(['month'])['gross_profit_to_asset'].transform(lambda x:pd.qcut(x, 10, labels=False, duplicates='drop'))


Unnamed: 0,Stkcd,month,market_value,monthly_stock_return,gross_profit_to_asset,GP/Assets_tag
0,000001,2000-01,28755726.61,0.061891,0.000793,2
1,000001,2000-02,28429838.73,-0.011333,0.000793,2
2,000001,2000-03,28507431.08,0.002729,0.000793,2
3,000001,2000-04,29562687.10,0.037017,0.000793,2
4,000001,2000-05,27933247.66,-0.055118,0.000793,2
...,...,...,...,...,...,...
86574,000783,2011-09,20914282.46,0.003413,0.014607,4
86575,000783,2011-10,22242173.41,0.063492,0.014719,4
86576,000783,2011-11,18614185.64,-0.163113,0.014719,4
86577,000783,2011-12,16954321.95,-0.089172,0.014719,4


In [64]:
# import and clean the risk-free rate data
rf = pd.read_excel('source_data/Risk-Free Rate/TRD_Nrrate.xlsx')  # load the risk-free rate data
rf = rf.iloc[2:, :]
rf = rf[['Clsdt', 'Nrrdaydt', 'Nrrdata']]
rf.columns = ['date', 'daily_rf', 'rf']
rf['date'] = pd.to_datetime(rf['date'])
rf['daily_rf'] = rf['daily_rf'] / 100  # change the unit of the risk-free rate to decimal
rf['rf'] = rf['rf'] / 100  # change the unit of the risk-free rate to decimal
rf['month'] = rf['date'].dt.to_period('M')
rf_monthly = rf.groupby('month').agg({'daily_rf': [lambda x: (1 + x).prod() - 1], 'rf': 'last'})
rf_monthly.reset_index(inplace=True)
rf_monthly.columns = ['month', 'monthly_rf', 'rf']

# merge the risk-free rate data into monthly_stock_return
monthly_stock_return['rf'] = monthly_stock_return['month'].map(rf_monthly.set_index('month')['monthly_rf'])

  warn("Workbook contains no default style, apply openpyxl's default")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  monthly_stock_return['rf'] = monthly_stock_return['month'].map(rf_monthly.set_index('month')['monthly_rf'])


In [78]:
monthly_stock_return['excess_return'] = monthly_stock_return['monthly_stock_return'] - monthly_stock_return['rf']

vw_group = monthly_stock_return.groupby(['month', 'GP/Assets_tag'], observed=False).apply(
    lambda x: np.average(x['excess_return'], weights=x['market_value']), include_groups=False).unstack()
vw_group = vw_group.reset_index()
vw_group['9 - 0'] = vw_group[9] - vw_group[0]

ew_group = monthly_stock_return.groupby(['month', 'GP/Assets_tag'], observed=False)['excess_return'].mean().unstack()
ew_group = ew_group.reset_index()
ew_group['9 - 0'] = ew_group[9] - ew_group[0]
ew_group

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  monthly_stock_return['excess_return'] = monthly_stock_return['monthly_stock_return'] - monthly_stock_return['rf']


GP/Assets_tag,month,0,1,2,3,4,5,6,7,8,9,9 - 0
0,2000-01,0.025264,0.074104,0.163821,0.137538,0.195758,0.167055,0.142019,0.154009,0.223832,0.184372,0.159109
1,2000-02,0.151682,0.175988,0.133681,0.134919,0.156823,0.133863,0.165835,0.172363,0.121170,0.084669,-0.067012
2,2000-03,0.200701,0.159007,0.115812,0.119012,0.086736,0.094692,0.123790,0.051767,0.063647,0.143851,-0.056850
3,2000-04,-0.007980,-0.004877,0.013227,-0.019040,0.028792,-0.004608,0.037414,0.010979,0.012676,0.050757,0.058737
4,2000-05,0.005894,0.057223,0.032928,0.019074,0.058036,0.023646,0.032350,0.040034,0.023077,0.080215,0.074321
...,...,...,...,...,...,...,...,...,...,...,...,...
284,2023-09,0.034149,-0.002940,-0.019211,-0.014027,-0.002333,0.005446,0.011416,-0.001415,-0.009313,-0.006572,-0.040722
285,2023-10,-0.042980,-0.004086,-0.007224,-0.017027,-0.017937,0.011412,-0.018248,0.013465,-0.003394,-0.021321,0.021659
286,2023-11,0.066009,0.050311,0.038859,0.024251,0.011553,0.032636,0.019972,0.012875,0.016962,0.003449,-0.062559
287,2023-12,-0.043321,-0.049720,-0.024832,-0.040354,-0.038167,-0.033185,-0.026216,-0.022889,0.010012,-0.028517,0.014804


In [79]:
stats_list = ['mean', 'var', 'min', 'max', 'std',
              lambda x: x.quantile(.25), lambda x: x.quantile(.5),
              lambda x: x.quantile(.75), 'skew', lambda
                  x: x.kurt()]

# value-weighted
vw_result = vw_group.agg({0: stats_list, 1: stats_list, 2: stats_list, 3: stats_list, 4: stats_list,
                          5: stats_list, 6: stats_list, 7: stats_list, 8: stats_list, 9: stats_list, '9 - 0':stats_list})
vw_result.index = ['mean', 'var', 'min', 'max', 'std', '25%', '50%', '75%', 'skew', 'kurt']
vw_result

GP/Assets_tag,0,1,2,3,4,5,6,7,8,9,9 - 0
mean,0.012969,0.013381,0.012322,0.013093,0.012601,0.014398,0.029633,0.018327,0.023322,0.031214,0.018245
var,0.012625,0.01109,0.009021,0.008712,0.010013,0.012434,0.028744,0.008849,0.010709,0.010192,0.010762
min,-0.312998,-0.29995,-0.30071,-0.263064,-0.256128,-0.268798,-0.290756,-0.331451,-0.310298,-0.232805,-0.210186
max,0.340743,0.408777,0.331057,0.3285,0.561338,1.086359,2.208695,0.33579,0.676209,0.743072,0.889282
std,0.11236,0.105311,0.09498,0.093338,0.100064,0.11151,0.169541,0.09407,0.103482,0.100954,0.103739
25%,-0.057168,-0.044449,-0.047141,-0.044082,-0.052793,-0.042808,-0.043375,-0.037395,-0.035651,-0.027767,-0.030769
50%,0.006933,0.008691,0.008969,0.005961,0.00468,0.006224,0.009548,0.013784,0.012144,0.024838,0.009234
75%,0.072105,0.063824,0.062042,0.059956,0.063789,0.065287,0.06744,0.060339,0.06991,0.081545,0.059103
skew,0.22326,0.508674,0.264245,0.20518,0.851244,3.25502,7.92147,0.054181,1.174581,2.221106,2.948672
kurt,0.58395,1.975065,1.007166,0.912528,3.587859,29.132379,96.208037,1.390667,6.071018,14.783951,20.684583


In [80]:
# equal-weighted
ew_result = ew_group.agg({0: stats_list, 1: stats_list, 2: stats_list, 3: stats_list, 4: stats_list,
                            5: stats_list, 6: stats_list, 7: stats_list, 8: stats_list, 9: stats_list, '9 - 0':stats_list})
ew_result.index = ['mean', 'var', 'min', 'max', 'std', '25%', '50%', '75%', 'skew', 'kurt']
ew_result

GP/Assets_tag,0,1,2,3,4,5,6,7,8,9,9 - 0
mean,0.007077,0.008029,0.008988,0.007644,0.008019,0.00873,0.013899,0.014034,0.018861,0.024394,0.017317
var,0.012722,0.011008,0.010258,0.009322,0.009034,0.008816,0.009179,0.008742,0.009896,0.010202,0.007004
min,-0.304891,-0.305499,-0.295526,-0.301732,-0.317174,-0.274656,-0.278625,-0.335562,-0.310874,-0.265863,-0.219637
max,0.355813,0.35155,0.447874,0.376349,0.377711,0.367772,0.444006,0.37323,0.436318,0.821806,0.692953
std,0.112794,0.10492,0.10128,0.096552,0.09505,0.093892,0.095807,0.093498,0.099477,0.101006,0.083693
25%,-0.057558,-0.053446,-0.05025,-0.048244,-0.049419,-0.048769,-0.040611,-0.041626,-0.036506,-0.030874,-0.030478
50%,0.002343,0.001521,0.00151,0.001736,0.00319,0.005377,0.006147,0.008873,0.010012,0.014519,0.009709
75%,0.06816,0.063295,0.061474,0.058789,0.058532,0.056834,0.056894,0.059136,0.059674,0.066868,0.058737
skew,0.278498,0.278985,0.368206,0.164116,0.199845,0.299027,0.550018,0.328049,0.664931,2.141142,2.076967
kurt,0.783715,1.074761,1.92346,1.243605,1.382076,1.155443,2.441847,1.862616,2.391541,15.297648,14.553929


from sheets above, we can find that the value-weighted and equal-weighted returns of the 9th group are higher than the 0th group. This means that the Gross Profit to asset ratio has a positive relationship with the stock return.

# 2. check the correlation of ratio factor with FF 5

In this part, we will check the correlation of the Gross Profit to asset ratio with the Fama-French 5 factors.

In [81]:
factor_data = pd.read_excel(
    'source_data/5-Factor Model Index (Monthly)/STK_MKT_FIVEFACMONTH.xlsx')
factor_data = factor_data[
    ['MarkettypeID', 'TradingMonth', 'Portfolios', 'RiskPremium2', 'SMB2', 'HML2', 'RMW2', 'CMA2']]
factor_data.columns = ['MarkettypeID', 'date', 'portfolios', 'risk_premium', 'smb', 'hml', 'rmw', 'cma']
factor_data = factor_data.iloc[2:, :]
factor_data = factor_data[factor_data['portfolios'] == 1]
factor_data = factor_data[
    factor_data['MarkettypeID'] == "P9706"]  # P9706: SSE-SZSE A share market (excluding STAR Market, ChiNext)
factor_data.reset_index(drop=True, inplace=True)  # reset the index to 0,1,2,...
factor_data['date'] = pd.to_datetime(factor_data['date']).dt.to_period('M')
factor_data = factor_data[['date', 'risk_premium', 'smb', 'hml', 'rmw', 'cma']]
factor_data.columns = ['month', 'risk_premium', 'SMB', 'HML', 'RMW', 'CMA']
factor_data.head(5)

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,MarkettypeID,date,portfolios,risk_premium,smb,hml,rmw,cma
0,P9706,2000-01,1,0.135225,-0.005175,-0.104151,0.042289,-0.076779
1,P9706,2000-02,1,0.113951,0.032327,-0.002393,-0.011365,0.03855
2,P9706,2000-03,1,0.058133,0.069624,0.01608,-0.051447,0.065904
3,P9706,2000-04,1,0.015578,-0.010779,0.02358,-0.022984,0.00972
4,P9706,2000-05,1,0.027197,0.025777,0.025355,-0.00662,-0.005409


In [89]:
factor_data = factor_data.merge(vw_group[['month', '9 - 0']], on='month', how='left')
factor_data = factor_data.rename(columns={'9 - 0': 'vw_GP/Assets'})
factor_data = factor_data.merge(ew_group[['month', '9 - 0']], on='month', how='left')
factor_data = factor_data.rename(columns={'9 - 0': 'ew_GP/Assets'})
factor_data

In [109]:
factor_data['risk_premium'] = factor_data['risk_premium'].astype(float)
factor_data['SMB'] = factor_data['SMB'].astype(float)
factor_data['HML'] = factor_data['HML'].astype(float)
factor_data['RMW'] = factor_data['RMW'].astype(float)
factor_data['CMA'] = factor_data['CMA'].astype(float)

## 2.1 Correlation

In [110]:
factor_data[['risk_premium', 'SMB', 'HML', 'RMW', 'CMA', 'vw_GP/Assets', 'ew_GP/Assets']].corr()

Unnamed: 0,risk_premium,SMB,HML,RMW,CMA,vw_GP/Assets,ew_GP/Assets
risk_premium,1.0,0.095291,-0.137413,-0.252714,0.103738,-0.192167,-0.055339
SMB,0.095291,1.0,-0.534818,-0.732352,0.435422,-0.500132,-0.489566
HML,-0.137413,-0.534818,1.0,0.260956,0.109132,0.153055,0.188274
RMW,-0.252714,-0.732352,0.260956,1.0,-0.708683,0.567996,0.546158
CMA,0.103738,0.435422,0.109132,-0.708683,1.0,-0.428637,-0.440299
vw_GP/Assets,-0.192167,-0.500132,0.153055,0.567996,-0.428637,1.0,0.810959
ew_GP/Assets,-0.055339,-0.489566,0.188274,0.546158,-0.440299,0.810959,1.0


In [111]:
factor_data[['risk_premium', 'SMB', 'HML', 'RMW', 'CMA', 'vw_GP/Assets', 'ew_GP/Assets']].corr('spearman')

Unnamed: 0,risk_premium,SMB,HML,RMW,CMA,vw_GP/Assets,ew_GP/Assets
risk_premium,1.0,0.175743,-0.162226,-0.292515,0.114944,-0.235134,-0.147382
SMB,0.175743,1.0,-0.494069,-0.692073,0.435525,-0.57521,-0.523783
HML,-0.162226,-0.494069,1.0,0.259921,0.06071,0.229719,0.218825
RMW,-0.292515,-0.692073,0.259921,1.0,-0.690923,0.675647,0.605857
CMA,0.114944,0.435525,0.06071,-0.690923,1.0,-0.496913,-0.487787
vw_GP/Assets,-0.235134,-0.57521,0.229719,0.675647,-0.496913,1.0,0.844751
ew_GP/Assets,-0.147382,-0.523783,0.218825,0.605857,-0.487787,0.844751,1.0


## 2.2 Regression

In [113]:
Y = factor_data['vw_GP/Assets']
X = factor_data[['risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,vw_GP/Assets,R-squared:,0.351
Model:,OLS,Adj. R-squared:,0.34
Method:,Least Squares,F-statistic:,30.61
Date:,"Wed, 19 Jun 2024",Prob (F-statistic):,7.38e-25
Time:,19:52:44,Log-Likelihood:,307.73
No. Observations:,289,AIC:,-603.5
Df Residuals:,283,BIC:,-581.5
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0240,0.005,4.669,0.000,0.014,0.034
risk_premium,-12.8690,7.340,-1.753,0.081,-27.317,1.579
SMB,-0.6244,0.195,-3.204,0.002,-1.008,-0.241
HML,-0.2556,0.201,-1.269,0.206,-0.652,0.141
RMW,1.0864,0.320,3.393,0.001,0.456,1.717
CMA,-0.2800,0.335,-0.836,0.404,-0.939,0.379

0,1,2,3
Omnibus:,328.04,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21838.889
Skew:,4.748,Prob(JB):,0.0
Kurtosis:,44.514,Cond. No.,1480.0


In [114]:
Y = factor_data['ew_GP/Assets']
X = factor_data[['risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,ew_GP/Assets,R-squared:,0.329
Model:,OLS,Adj. R-squared:,0.317
Method:,Least Squares,F-statistic:,27.78
Date:,"Wed, 19 Jun 2024",Prob (F-statistic):,7.209999999999999e-23
Time:,19:53:00,Log-Likelihood:,365.02
No. Observations:,289,AIC:,-718.0
Df Residuals:,283,BIC:,-696.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0201,0.004,4.765,0.000,0.012,0.028
risk_premium,7.1328,6.020,1.185,0.237,-4.718,18.983
SMB,-0.3421,0.160,-2.141,0.033,-0.657,-0.028
HML,0.0827,0.165,0.500,0.617,-0.243,0.408
RMW,0.8495,0.263,3.235,0.001,0.333,1.366
CMA,-0.5280,0.275,-1.923,0.055,-1.069,0.012

0,1,2,3
Omnibus:,238.281,Durbin-Watson:,1.929
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8063.593
Skew:,2.991,Prob(JB):,0.0
Kurtosis:,28.177,Cond. No.,1480.0


from the regression results above, we can find that the Gross Profit to asset ratio has relationship with SMB and RMW.

And since R-squared is very low, we can conclude that the Gross Profit is a new factor different from FF5.

# 3. Fama-macbeth regression on FF5 and Gross Profit to asset ratio

we will run two Fama-macbeth regression：on FF5, and FF5 with Gross Profit to asset ratio. We want to know:
1. whether the Gross Profit to asset ratio is a new factor by comparing  whether the coefficient of Gross Profit to asset ratio is significant.
2. whether the Gross Profit to asset ratio can explain the cross-section stock return better by comparing the R-squared of the two regressions and the alpha of the two regressions.

It should be pointed out that, in this part, we use portfolio level while in part 2 we use stock level. 

Moreover, we use rolling regression with window=12 to calculate the factor loading, while in part 2 we use the definition of $\beta: \frac{cov}{var}$ to get loading

In [122]:
monthly_stock_return

Unnamed: 0,Stkcd,month,market_value,monthly_stock_return,gross_profit_to_asset,GP/Assets_tag,rf,excess_return
0,000001,2000-01,28755726.61,0.061891,0.000793,2,0.001893,0.059998
1,000001,2000-02,28429838.73,-0.011333,0.000793,2,0.001771,-0.013104
2,000001,2000-03,28507431.08,0.002729,0.000793,2,0.001893,0.000836
3,000001,2000-04,29562687.10,0.037017,0.000793,2,0.001832,0.035185
4,000001,2000-05,27933247.66,-0.055118,0.000793,2,0.001893,-0.057011
...,...,...,...,...,...,...,...,...
86574,000783,2011-09,20914282.46,0.003413,0.014607,4,0.002824,0.000589
86575,000783,2011-10,22242173.41,0.063492,0.014719,4,0.002918,0.060574
86576,000783,2011-11,18614185.64,-0.163113,0.014719,4,0.002824,-0.165937
86577,000783,2011-12,16954321.95,-0.089172,0.014719,4,0.002918,-0.092090


In [127]:
monthly_stock_return['mkt_risk_premium'] = monthly_stock_return['month'].map(factor_data.set_index('month')['risk_premium'])
monthly_stock_return['SMB'] = monthly_stock_return['month'].map(factor_data.set_index('month')['SMB'])
monthly_stock_return['HML'] = monthly_stock_return['month'].map(factor_data.set_index('month')['HML'])
monthly_stock_return['RMW'] = monthly_stock_return['month'].map(factor_data.set_index('month')['RMW'])
monthly_stock_return['CMA'] = monthly_stock_return['month'].map(factor_data.set_index('month')['CMA'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  monthly_stock_return['mkt_risk_premium'] = monthly_stock_return['month'].map(factor_data.set_index('month')['risk_premium'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  monthly_stock_return['SMB'] = monthly_stock_return['month'].map(factor_data.set_index('month')['SMB'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-

In [None]:
def simple_reg(y, x1, x2, x3, x4, x5):
    x = sm.add_constant(pd.DataFrame([x1, x2, x3, x4, x5]).T)
    x = sm.add_constant(x)
    model = sm.OLS(y, x).fit()
    return model.params

In [258]:
tmp1 = monthly_stock_return.groupby(['month', 'GP/Assets_tag'], observed=False)['excess_return'].mean().reset_index()
tmp1 = tmp1.merge(factor_data, on='month', how='left')
tmp2 = tmp1.groupby(['GP/Assets_tag']).apply(lambda x: rolling_apply(simple_reg, 12,x['excess_return'], x['risk_premium'], x['SMB'], x['HML'], x['RMW'], x['CMA']))
tmp3 = tmp2[0]
tmp3 = pd.DataFrame(tmp3)
tmp3.columns = ['intercept', 'risk_premium_loading', 'SMB_loading', 'HML_loading', 'RMW_loading', 'CMA_loading']
tmp3['GP/Assets_tag'] = 0
tmp3['month'] = factor_data['month']

for i in range(1, 10):
    tmp4 = tmp2[i]
    tmp4 = pd.DataFrame(tmp4)
    tmp4.columns = ['intercept', 'risk_premium_loading', 'SMB_loading', 'HML_Loading', 'RMW_loading', 'CMA_loading']
    tmp4['GP/Assets_tag'] = i
    tmp4['month'] = factor_data['month']
    tmp3 = pd.concat([tmp3,tmp4])

tmp1 = tmp1.merge(tmp3, on=['month', 'GP/Assets_tag'], how='left')
# monthly_stock_return.drop(columns=['mkt_loading', 'SMB_loading', 'HML_loading', 'RMW_loading', 'CMA_loading'])

  tmp2 = tmp1.groupby(['GP/Assets_tag']).apply(lambda x: rolling_apply(simple_reg, 12,x['excess_return'], x['risk_premium'], x['SMB'], x['HML'], x['RMW'], x['CMA']))


In [263]:
tmp1

Unnamed: 0,month,GP/Assets_tag,excess_return,risk_premium,SMB,HML,RMW,CMA,vw_GP/Assets,ew_GP/Assets,intercept,risk_premium_loading,SMB_loading,HML_loading,RMW_loading,CMA_loading,HML_Loading,t+1_excess_return
0,2000-01,0,0.025264,0.001352,-0.005175,-0.104151,0.042289,-0.076779,0.235983,0.159109,,,,,,,,0.151682
1,2000-01,1,0.074104,0.001352,-0.005175,-0.104151,0.042289,-0.076779,0.235983,0.159109,,,,,,,,0.175988
2,2000-01,2,0.163821,0.001352,-0.005175,-0.104151,0.042289,-0.076779,0.235983,0.159109,,,,,,,,0.133681
3,2000-01,3,0.137538,0.001352,-0.005175,-0.104151,0.042289,-0.076779,0.235983,0.159109,,,,,,,,0.134919
4,2000-01,4,0.195758,0.001352,-0.005175,-0.104151,0.042289,-0.076779,0.235983,0.159109,,,,,,,,0.156823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2885,2024-01,5,-0.098389,-0.000571,-0.108162,0.108981,-0.003561,-0.007933,0.084720,-0.024187,0.007293,150.038557,0.415854,,-1.584795,-0.838439,0.083820,
2886,2024-01,6,-0.116874,-0.000571,-0.108162,0.108981,-0.003561,-0.007933,0.084720,-0.024187,-0.000041,107.501879,0.521687,,-0.778533,-0.202019,0.014687,
2887,2024-01,7,-0.137894,-0.000571,-0.108162,0.108981,-0.003561,-0.007933,0.084720,-0.024187,-0.012981,84.298594,1.008163,,0.226058,0.459193,0.434305,
2888,2024-01,8,-0.107867,-0.000571,-0.108162,0.108981,-0.003561,-0.007933,0.084720,-0.024187,0.006425,121.228900,0.004937,,-0.223974,-0.155997,-0.431954,


In [261]:
tmp1['t+1_excess_return'] = tmp1.groupby("GP/Assets_tag")['excess_return'].shift(-1)

In [262]:
ff5_result = fama_macbeth.fm_summary(fama_macbeth.fama_macbeth(tmp1[['month', 't+1_excess_return', 'risk_premium_loading', 'SMB_loading', 'HML_loading', 'RMW_loading', 'CMA_loading']].dropna(),'month', 't+1_excess_return', ['risk_premium_loading', 'SMB_loading', 'HML_loading', 'RMW_loading', 'CMA_loading']), pvalues=True)
ff5_result

Unnamed: 0,mean,std_error,tstat,pval
intercept,6.8e-05,5.5e-05,1.22751,0.220677
risk_premium_loading,0.000196,0.000131,1.492575,0.136691
SMB_loading,-9.4e-05,8.5e-05,-1.098965,0.272741
HML_loading,7e-05,6e-05,1.179626,0.239165
RMW_loading,-0.000224,0.000202,-1.109941,0.26799
CMA_loading,0.000102,7.5e-05,1.358958,0.175269


In [267]:
ff5_result.to_excel('output/part3/3. ff5_result.xlsx')

In [265]:
def simple_reg6(y, x1, x2, x3, x4, x5, x6):
    x = sm.add_constant(pd.DataFrame([x1, x2, x3, x4, x5, x6]).T)
    x = sm.add_constant(x)
    model = sm.OLS(y, x).fit()
    return model.params

tmp1 = monthly_stock_return.groupby(['month', 'GP/Assets_tag'], observed=False)['excess_return'].mean().reset_index()
tmp1 = tmp1.merge(factor_data, on='month', how='left')
tmp2 = tmp1.groupby(['GP/Assets_tag']).apply(lambda x: rolling_apply(simple_reg6, 12,x['excess_return'], x['risk_premium'], x['SMB'], x['HML'], x['RMW'], x['CMA'], x['vw_GP/Assets']))
tmp3 = tmp2[0]
tmp3 = pd.DataFrame(tmp3)
tmp3.columns = ['intercept', 'risk_premium_loading', 'SMB_loading', 'HML_loading', 'RMW_loading', 'CMA_loading', 'GP/Assets_loading']
tmp3['GP/Assets_tag'] = 0
tmp3['month'] = factor_data['month']

for i in range(1, 10):
    tmp4 = tmp2[i]
    tmp4 = pd.DataFrame(tmp4)
    tmp4.columns = ['intercept', 'risk_premium_loading', 'SMB_loading', 'HML_Loading', 'RMW_loading', 'CMA_loading', 'GP/Assets_loading']
    tmp4['GP/Assets_tag'] = i
    tmp4['month'] = factor_data['month']
    tmp3 = pd.concat([tmp3,tmp4])

tmp1 = tmp1.merge(tmp3, on=['month', 'GP/Assets_tag'], how='left')

  tmp2 = tmp1.groupby(['GP/Assets_tag']).apply(lambda x: rolling_apply(simple_reg6, 12,x['excess_return'], x['risk_premium'], x['SMB'], x['HML'], x['RMW'], x['CMA'], x['vw_GP/Assets']))


In [266]:
ff5_with_ratio_result = fama_macbeth.fm_summary(fama_macbeth.fama_macbeth(monthly_stock_return[['month', 't+1_excess_return', 'mkt_loading', 'SMB_loading', 'HML_loading', 'RMW_loading', 'CMA_loading', 'GP/Assets_loading']].replace([np.inf, -np.inf], np.nan).dropna(),'month', 't+1_excess_return', ['mkt_loading', 'SMB_loading', 'HML_loading', 'RMW_loading', 'CMA_loading', 'GP/Assets_loading']), pvalues=True)
ff5_with_ratio_result

Unnamed: 0,mean,std_error,tstat,pval
intercept,0.00554,0.005121,1.081798,0.280315
mkt_loading,3.2e-05,2.9e-05,1.106692,0.269419
SMB_loading,0.000188,0.004042,0.046601,0.962866
HML_loading,-0.00056,0.00204,-0.274392,0.783995
RMW_loading,0.003982,0.005753,0.692169,0.489431
CMA_loading,0.003063,0.00298,1.027891,0.304928
GP/Assets_loading,-6.2e-05,0.00013,-0.47977,0.631783


In [268]:
ff5_with_ratio_result.to_excel('output/part3/3. ff5_with_ratio_result.xlsx')

# 4. Using different model to explain the abnormal return of each group of Gross Profit to asset ratio

the target of this part is to find the alpha of group 0-9 using different models. And if different groups show different alpha, it means that the Gross Profit to asset ratio is indeed a new factor.

the output is similar to what teacher had in her ppt "part 3 page 16”, which is shown below:

![pic1.png](img/pic1.png)

In [270]:
vw_reg = vw_group.merge(factor_data, on='month', how='left')
ew_reg = ew_group.merge(factor_data, on='month', how='left')
vw_reg

Unnamed: 0,month,0,1,2,3,4,5,6,7,8,9,9 - 0,risk_premium,SMB,HML,RMW,CMA,vw_GP/Assets,ew_GP/Assets
0,2000-01,0.041487,0.110694,0.148717,0.125170,0.363679,0.153833,0.140541,0.176434,0.250842,0.277470,0.235983,0.001352,-0.005175,-0.104151,0.042289,-0.076779,0.235983,0.159109
1,2000-02,0.153813,0.187468,0.117026,0.123145,0.118053,0.203588,0.163152,0.225821,0.127111,0.081690,-0.072123,0.001140,0.032327,-0.002393,-0.011365,0.038550,-0.072123,-0.067012
2,2000-03,0.202511,0.163047,0.113997,0.172959,0.066662,0.050036,0.152983,0.030400,0.046828,0.085290,-0.117221,0.000581,0.069624,0.016080,-0.051447,0.065904,-0.117221,-0.056850
3,2000-04,0.013512,0.031477,0.041104,-0.023696,0.041892,-0.003396,0.093339,0.023434,0.014600,0.082781,0.069270,0.000156,-0.010779,0.023580,-0.022984,0.009720,0.069270,0.058737
4,2000-05,0.009731,0.054129,0.028487,0.005961,0.029599,0.011899,0.018814,0.029398,0.025569,0.067565,0.057835,0.000272,0.025777,0.025355,-0.006620,-0.005409,0.057835,0.074321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,2023-09,0.099729,0.027375,-0.029311,-0.016248,-0.011449,0.000304,0.009548,0.010110,-0.021790,-0.023566,-0.123295,-0.000023,0.009100,0.015482,-0.004970,-0.003365,-0.123295,-0.040722
285,2023-10,-0.061111,0.026048,0.007699,-0.006055,-0.037969,-0.037187,0.011301,0.033447,-0.056399,-0.027629,0.033482,-0.000294,0.026882,-0.009619,-0.016592,-0.003727,0.033482,0.021659
286,2023-11,0.074403,0.050556,0.021693,0.002233,0.003798,0.004118,0.120351,0.028812,0.004590,-0.019134,-0.093537,-0.000016,0.052495,-0.008304,-0.021795,0.019977,-0.093537,-0.062559
287,2023-12,-0.031025,-0.061531,-0.035893,-0.023191,-0.024813,-0.042051,-0.060085,-0.036395,0.012318,-0.028875,0.002150,-0.000154,0.005892,0.002136,0.019856,-0.014981,0.002150,0.014804


In [286]:
def get_a_panel(x):
    result = pd.DataFrame([[0,]*11]*6, columns=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q10-Q1'], index=['CAPM','t-CAPM', 'FF3', 't-FF3', 'FF5', 't-FF5'], dtype=float)
    
    for i in range(10):
        temp = x[[i, 'risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]
        Y = temp[i]
        X = temp[['risk_premium']]
        X = sm.add_constant(X)
        model = sm.OLS(Y, X).fit()
        result.iloc[0,i] = model.params.iloc[0]
        result.iloc[1,i] = model.tvalues.iloc[0]
        
        X = temp[['risk_premium', 'SMB', 'HML']]
        X = sm.add_constant(X)
        model = sm.OLS(Y, X).fit()
        result.iloc[2,i] = model.params.iloc[0]
        result.iloc[3,i] = model.tvalues.iloc[0]
        
        X = temp[['risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]
        X = sm.add_constant(X)
        model = sm.OLS(Y, X).fit()
        result.iloc[4,i] = model.params.iloc[0]
        result.iloc[5,i] = model.tvalues.iloc[0]
    return result

vw_panel = get_a_panel(vw_reg)
ew_panel = get_a_panel(ew_reg)

In [288]:
vw_panel.to_excel('output/part3/4. vw_panel.xlsx')
ew_panel.to_excel('output/part3/4. ew_panel.xlsx')

# 5. Calculating alpha of long-short strategy

in this part, we want to conclude a similar table in teacher's PPT "part 3 page 14", which is shown below

![pic2.png](img/pic2.png)

In [289]:
monthly_stock_return

Unnamed: 0,Stkcd,month,market_value,monthly_stock_return,gross_profit_to_asset,GP/Assets_tag,rf,excess_return,mkt_risk_premium,SMB,HML,RMW,CMA,mkt_loading,SMB_loading,HML_loading,RMW_loading,CMA_loading,GP/Assets_loading,t+1_excess_return
0,000001,2000-01,28755726.61,0.061891,0.000793,2,0.001893,0.059998,0.001352,-0.005175,-0.104151,0.042289,-0.076779,,,,,,,-0.013104
1,000001,2000-02,28429838.73,-0.011333,0.000793,2,0.001771,-0.013104,0.001140,0.032327,-0.002393,-0.011365,0.038550,,,,,,,0.000836
2,000001,2000-03,28507431.08,0.002729,0.000793,2,0.001893,0.000836,0.000581,0.069624,0.016080,-0.051447,0.065904,,,,,,,0.035185
3,000001,2000-04,29562687.10,0.037017,0.000793,2,0.001832,0.035185,0.000156,-0.010779,0.023580,-0.022984,0.009720,,,,,,,-0.057011
4,000001,2000-05,27933247.66,-0.055118,0.000793,2,0.001893,-0.057011,0.000272,0.025777,0.025355,-0.006620,-0.005409,,,,,,,0.005390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86574,000783,2011-09,20914282.46,0.003413,0.014607,4,0.002824,0.000589,-0.000895,-0.026651,0.033594,0.000232,0.027549,187.723146,-0.680276,0.495249,-3.991312,4.162236,-0.253802,0.060574
86575,000783,2011-10,22242173.41,0.063492,0.014719,4,0.002918,0.060574,0.000438,-0.007256,0.027475,-0.006046,-0.007476,188.232367,-0.844784,0.677530,-4.886126,4.520145,0.416858,-0.165937
86576,000783,2011-11,18614185.64,-0.163113,0.014719,4,0.002824,-0.165937,-0.000549,0.015033,-0.023096,-0.007514,0.012056,195.649337,-0.871510,0.783313,-4.776836,4.237369,-0.663096,-0.092090
86577,000783,2011-12,16954321.95,-0.089172,0.014719,4,0.002918,-0.092090,-0.000715,-0.082247,0.079022,0.077784,-0.045039,192.982574,-0.434647,0.222323,-2.470401,3.521992,-0.943478,0.094984


In [308]:
# we use average level as the indicator of GP/Assets tag
tmp = monthly_stock_return.groupby('Stkcd')['gross_profit_to_asset'].rolling(3).mean().reset_index()
tmp.columns = ["Stkcd", 'month', 'past_3_GP/Assets']

# tmp['month'] = pd.to_datetime(tmp['month']).dt.to_period('M')

In [309]:
tmp

Unnamed: 0,Stkcd,month,past_3_GP/Assets
0,000001,0,
1,000001,1,
2,000001,2,0.000793
3,000001,3,0.000793
4,000001,4,0.000793
...,...,...,...
86280,000783,86574,0.014607
86281,000783,86575,0.014644
86282,000783,86576,0.014682
86283,000783,86577,0.014719


In [319]:
start_date = pd.Timestamp('2000-01-01')
tmp['month'].apply(lambda x: start_date + relativedelta(months=x))

0        2000-01-01 00:00:00
1        2000-02-01 00:00:00
2        2000-03-01 00:00:00
3        2000-04-01 00:00:00
4        2000-05-01 00:00:00
                ...         
86280    9214-07-01 00:00:00
86281    9214-08-01 00:00:00
86282    9214-09-01 00:00:00
86283    9214-10-01 00:00:00
86284    9214-11-01 00:00:00
Name: month, Length: 86285, dtype: object