**每周按照proxy ( volatility, size, profitability, EPR ) 从低到高分成四组，用g_proxy_i ( i=0,1,2,3 )表示，再将每个proxy组内的股票按照WOR从低到高分成10组，用g_wor_i ( i=0,1,...,9 )表示，并计算每个小组一周后的收益率WOR_1，比较不同proxy组内g_wor_0和g_wor_9的收益率差异，来证明波动率大、市值小、利润低、EPR低的公司WOR更高**

In [1]:
import pandas as pd
from scipy import stats
import numpy as np

In [2]:
#两个函数，partition用于分组，ttest用于t检验

#key ：关键值， n：分组数
def partition(x, key, n):
    """Return the quantile-bin number of every row of ``x`` for column ``key``.

    Parameters
    ----------
    x : table-like object indexable by ``key`` (e.g. a DataFrame or a
        groupby chunk).
    key : name of the column whose values are binned.
    n : number of quantile bins requested.

    Returns
    -------
    The result of ``pd.qcut`` with integer bin codes instead of interval
    labels (``labels=False``). Duplicate bin edges are dropped, so fewer
    than ``n`` distinct codes may come back.
    """
    values = x[key]
    bin_codes = pd.qcut(values, q=n, labels=False, duplicates='drop')
    return bin_codes


def ttest(a1, a2, alpha=0.05):
    """Return the two-sample t statistic for ``a1`` versus ``a2``.

    Levene's test is run first to decide which t-test to use. Its null
    hypothesis is that both samples have equal variances, so:

    * p-value >  ``alpha``: equal variances cannot be rejected, and the
      pooled-variance (Student) t-test is used;
    * p-value <= ``alpha``: variances differ, and Welch's t-test is used.

    Parameters
    ----------
    a1, a2 : array-like samples of observations.
    alpha : significance level applied to the Levene p-value.

    Returns
    -------
    The t statistic from ``scipy.stats.ttest_ind``.
    """
    levene_p = stats.levene(a1, a2)[1]
    # The original branch was inverted: it ran Welch's test when the
    # variances looked equal and the pooled test when they did not.
    variances_look_equal = levene_p > alpha
    result = stats.ttest_ind(a1, a2, equal_var=variances_look_equal)
    return result[0]

In [3]:
# Load the pre-assembled summary data set.
df = pd.read_csv('../data/total.csv')

# Keep only the columns used below. group_num is deliberately left out
# because the groups are rebuilt in this notebook.
df = df[['Date', 'Code', 'WOR', 'WTR', 'Vol', 'Size', 'Profitability', 'EPR']]

# Parse the date column so that date arithmetic works later on.
df['Date'] = pd.to_datetime(df['Date'])
display(df.head())

Unnamed: 0,Date,Code,WOR,WTR,Vol,Size,Profitability,EPR
0,2009-03-03,600718.SH,-0.499369,,0.210875,295325.492124,0.142162,28528420.0
1,2009-03-03,600991.SH,-0.411673,-0.066589,0.200163,96569.256078,0.040064,11613470.0
2,2009-03-03,000425.SZ,-0.407139,0.017674,0.203814,589681.232985,-0.059167,-2670190.0
3,2009-03-03,000560.SZ,-0.278895,-0.185841,0.160781,47291.58565,0.009247,658890.8
4,2009-03-03,600433.SH,-0.236562,-0.167568,0.190481,30333.6,-0.058891,-3167137.0


In [4]:
# WOR_1: each stock's WOR one week later.
df = df.sort_values(by=['Code', 'Date']).reset_index(drop=True)

# Build a (Date, Code, WOR_1) lookup whose Date is moved back by one week,
# so joining on (Date, Code) attaches next week's WOR to this week's row.
w1 = df[['Date', 'Code', 'WOR']].rename(columns={'WOR': 'WOR_1'})
w1['Date'] = w1['Date'] - pd.Timedelta(weeks=1)

# Inner join (the merge default): rows without an observation exactly one
# week later are dropped.
df = df.merge(w1, on=['Date', 'Code'], how='inner')
display(df.head())

Unnamed: 0,Date,Code,WOR,WTR,Vol,Size,Profitability,EPR,WOR_1
0,2009-03-03,000001.SZ,-0.185929,-0.098732,0.184922,2634237.0,0.038043,39143100.0,-0.010132
1,2009-03-10,000001.SZ,-0.010132,0.099186,0.184922,2634237.0,0.038043,39143100.0,0.026351
2,2009-03-17,000001.SZ,0.026351,0.070034,0.184922,2634237.0,0.038043,39143100.0,-0.008496
3,2009-03-24,000001.SZ,-0.008496,-0.013845,0.184922,2634237.0,0.038043,39143100.0,-0.025068
4,2009-03-31,000001.SZ,-0.025068,0.01723,0.184922,4438662.0,0.038043,39143100.0,0.014501


Rank by volatility 

In [5]:
def Hard_to_Vale(proxy, df):
    """Compare next-week WOR of the extreme WOR deciles inside each proxy group.

    Steps:
      1. Per ``Date``, bin stocks into 4 quantile groups of ``proxy``
         (column ``g_proxy``).
      2. Per ``(Date, g_proxy)``, bin stocks into 10 quantile groups of
         ``WOR`` (column ``g_wor``).
      3. Keep only ``g_wor == 0`` and ``g_wor == 9`` and average ``WOR_1``
         per ``(g_proxy, g_wor)``.
      4. Within each ``g_proxy``, report the difference of the two means and
         the (integer-rounded) t statistic of ``WOR_1`` between both deciles.

    Parameters
    ----------
    proxy : name of the column of ``df`` used as the hard-to-value proxy.
    df : DataFrame with columns ``Date``, ``Code``, ``proxy``, ``WOR`` and
        ``WOR_1``.

    Returns
    -------
    (table4_proxy, t41)
        ``table4_proxy`` is indexed by ``(g_proxy, g_wor)`` and has the
        columns ``WOR_1`` (mean), ``diff`` and ``t``; ``diff`` and ``t`` are
        filled on the ``g_wor == 9`` row of each proxy group and NaN on the
        ``g_wor == 0`` row.
        ``t41`` is the t statistic of ``WOR_1`` between ``g_proxy == 0`` and
        ``g_proxy == 3``, computed on the retained extreme-decile rows.
    """
    # Rows without a proxy value cannot be placed in any proxy group and
    # never reach the results; dropping them first keeps every later step
    # free of NaN group keys.
    df_proxy = df[['Date', 'Code', proxy, 'WOR', 'WOR_1']].dropna(subset=[proxy])
    df_proxy = df_proxy.sort_values(by=['Date', proxy]).reset_index(drop=True)

    # transform() returns values aligned on df_proxy's index. The previous
    # positional assignment (.values / reset_index + concat) silently shifted
    # labels whenever groupby dropped rows with a NaN key.
    df_proxy['g_proxy'] = df_proxy.groupby('Date')[proxy].transform(
        lambda s: partition(s.to_frame(proxy), proxy, 4)
    )
    df_proxy['g_wor'] = df_proxy.groupby(['Date', 'g_proxy'])['WOR'].transform(
        lambda s: partition(s.to_frame('WOR'), 'WOR', 10)
    )

    # Keep only the lowest (0) and highest (9) WOR deciles.
    extremes = df_proxy[df_proxy['g_wor'].isin([0, 9])].reset_index(drop=True)

    # Average WOR_1 only: Date and Code are not meaningful to average, and
    # recent pandas versions raise when asked to average string columns.
    table4_proxy = extremes.groupby(['g_proxy', 'g_wor'])[['WOR_1']].mean()

    # Mean WOR_1 of decile 9 minus decile 0 inside each proxy group.
    table4_proxy['diff'] = table4_proxy.groupby(level='g_proxy')['WOR_1'].diff()

    # t-test between decile 0 and decile 9 inside each proxy group, placed on
    # the decile-9 row. Iterating over the table's own index avoids assuming
    # that exactly four proxy groups with two rows each are present.
    t_val = []
    for group, decile in table4_proxy.index:
        if decile != 9:
            t_val.append(np.nan)
            continue
        in_group = extremes['g_proxy'] == group
        a1 = extremes.loc[in_group & (extremes['g_wor'] == 0), 'WOR_1'].dropna()
        a2 = extremes.loc[in_group & (extremes['g_wor'] == 9), 'WOR_1'].dropna()
        t_val.append(round(ttest(a1, a2)))
    table4_proxy['t'] = t_val

    # t-test of WOR_1 between the first and the last proxy group.
    # NOTE(review): this pools both extreme deciles of each proxy group; it
    # is not a test on the decile spread itself - confirm this is intended.
    a1 = extremes.loc[extremes['g_proxy'] == 0, 'WOR_1'].dropna()
    a2 = extremes.loc[extremes['g_proxy'] == 3, 'WOR_1'].dropna()
    t41 = ttest(a1, a2)

    return (table4_proxy, t41)

In [6]:
# Build table 4a: one (WOR_1, diff, t) column triple per proxy.
table4_a = []
t41 = []
proxy = ['Vol', 'Size', 'Profitability', 'EPR']
for i in proxy:
    # Call the analysis once per proxy and unpack both results; it was
    # previously run three times per proxy with one result thrown away.
    table_i, t_i = Hard_to_Vale(i, df)
    table4_a.append(table_i)
    # t statistic comparing proxy group 0 with proxy group 3.
    t41.append(t_i)

# keys= adds the proxy label as the top column level, giving the same
# two-level header as typing every (label, column) pair by hand.
table4_a = pd.concat(table4_a, axis=1, keys=['Vol', 'Size', 'Profit', 'EPR'])

display(table4_a)

Unnamed: 0_level_0,Unnamed: 1_level_0,Vol,Vol,Vol,Size,Size,Size,Profit,Profit,Profit,EPR,EPR,EPR
Unnamed: 0_level_1,Unnamed: 1_level_1,WOR_1,diff,t,WOR_1,diff,t,WOR_1,diff,t,WOR_1,diff,t
g_proxy,g_wor,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
0,0.0,-0.012954,,,-0.01734,,,-0.008214,,,-0.020039,,
0,9.0,-0.00252,0.010434,-24.0,-0.002336,0.015003,-26.0,-0.007248,0.000966,-1.0,-0.003145,0.016894,-29.0
1,0.0,-0.013994,,,-0.016796,,,-0.006363,,,-0.015823,,
1,9.0,-0.003754,0.01024,-23.0,-0.005057,0.011739,-24.0,-0.006713,-0.000349,1.0,-0.004458,0.011366,-22.0
2,0.0,-0.015229,,,-0.014053,,,-0.005884,,,-0.012609,,
2,9.0,-0.004265,0.010964,-23.0,-0.004854,0.009198,-21.0,-0.004532,0.001352,-3.0,-0.004637,0.007971,-19.0
3,0.0,-0.017772,,,-0.011258,,,-0.005157,,,-0.009785,,
3,9.0,-0.005024,0.012749,-24.0,-0.003043,0.008215,-21.0,-0.00425,0.000906,-2.0,-0.00334,0.006445,-19.0


In [7]:
# Keep only the top WOR decile rows (g_wor == 9); these rows carry the
# per-proxy-group 'diff' values.
is_top_decile = table4_a.index.get_level_values('g_wor') == 9.0
diff41 = table4_a.loc[is_top_decile]

# Rows 0 and 3 are the first and last proxy groups; positions 1, 4, 7, 10
# are the 'diff' column of each proxy in table4_a's column layout.
first_and_last = diff41.iloc[[0, 3], [1, 4, 7, 10]]

# Last-group spread minus first-group spread, one value per proxy.
table4_b = first_and_last.diff().dropna().reset_index(drop=True)
columns = ['Vol_diff', 'Size_diff', 'Profit_diff', 'EPR_diff']
table4_b.columns = columns

# Append the matching t statistics as a row labelled 't'.
table4_b.loc['t'] = t41
print('the difference between g_proxy_0 and g_proxy_3')
display(table4_b)

the difference between g_proxy_0 and g_proxy_3


Unnamed: 0,Vol_diff,Size_diff,Profit_diff,EPR_diff
0,0.002315,-0.006789,-6e-05,-0.01045
t,10.748964,-7.948563,-7.730696,-15.077396


**结论**

1. **波动率**: 随着波动率增加，WOR差值从0.010434增加到0.012749，差值显著(t=10.75)，说明越难估值，WOR越高。<br>


2. **Size** : 随着市值增加，WOR差值从0.015003降低到0.008215，差值显著（t=-7.95），符合预期；需要说明的一点是，本文按照数值大小进行排序，size第0组是市值最小组，为最难估值组，size第3组为市值最大组，为最易估值组，所以虽然符号和波动率相反，但是结论是一致的，即越难估值，WOR越高。


3. **收益**: 随着公司收益增加，WOR差值从0.000966降低到0.000906，虽然两组相差很小，但是差值显著（t=-7.73），说明越难估值（收益率越低），WOR越高。但是与其他三组不同的是，在收益第0组和第1组（g_proxy_0，g_proxy_1）内，WOR差值不显著（t=-1，t=1），这和前文table1中分析一致，不同组之间的收益没有表现出明显的特征。


4. **EPR**: 随着公司EPR增加，WOR差值从0.016894降低到0.006445，两组相差很大，差值显著（t=-15.08），说明EPR低，WOR高。EPR低说明P/E高，为成长股，成长股比价值股难估值，因此公司越难估值，WOR越高。

本文假设WOR可以表示情绪，则难估值的公司受到投资者情绪影响大，WOR较大。本文通过四个角度来衡量（波动率，市值，收益，EPR）公司的估值难度，证明了高波动率、低市值、低收益率和低EPR的公司，WOR大，从估值角度证明WOR可以表示投资者情绪。