环境设定

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.utils import resample
import datetime
from datetime import datetime

数据导入

In [2]:
# 基金数据导入
funds = pd.read_csv('funds_data.csv')
# Cahart模型因子数据导入
cahart = pd.read_csv('fivefactor_weekly.csv')

数据调整

In [3]:
# 时间类数据格式调整
funds['时间'] = pd.to_datetime(funds['时间'], format='%Y-%m-%d')
cahart['trdwk'] = pd.to_datetime(cahart['trdwk'], format='%Y-%m-%d')

In [4]:
# 基金回报率计算
funds['单位净值(初)']=funds.groupby('代码')['周单位净值(元)'].shift(1)
funds['净回报率'] = funds['周单位净值(元)']/funds['单位净值(初)'] - 1

In [5]:
# 取Cahart模型所需因子：市场风险因子（mkt_rf）、规模风险因子（smb）、账面市值比风险因子（hml）、\
#                    惯性/动量因子（umd）及无风险利率（rf）
c1 = cahart[(cahart['trdwk']>="2005-01-07")&(cahart['trdwk']<="2018-12-28")]
c2 = c1[['trdwk','mkt_rf','smb','hml','umd','rf']]
# 取基金净值有效时间段内数据
f = funds[(funds['时间']>='2005-01-07')&(funds['时间']<='2018-12-28')]
f = f.rename(columns = {'时间': 'trdwk'})
# 合并因子与基金净值
cmodel = pd.merge(f, c2, how='left', on='trdwk')

In [6]:
# 处理缺失值： 用上一个非缺失值填补
cmodel = cmodel.fillna(method='ffill')

In [7]:
# 调整合并后数据列名
cmodel = cmodel.drop(columns = ['周单位净值(元)', '单位净值(初)'])
cmodel = cmodel.rename(columns = {'代码': 'code', '简称': 'name', '净回报率': 'ri'})

In [8]:
cmodel.head()

Unnamed: 0,code,name,trdwk,ri,mkt_rf,smb,hml,umd,rf
0,510050.OF,50ETF,2005-01-07,-0.01506,-0.009569,0.016692,-0.0016,-0.026043,0.000428
1,510050.OF,50ETF,2005-01-14,0.002039,0.00233,0.006033,0.002732,-0.003396,0.000428
2,510050.OF,50ETF,2005-01-21,0.005086,-0.008801,0.004323,0.002299,-0.018932,0.000428
3,510050.OF,50ETF,2005-01-28,-0.012146,-0.022081,-0.017804,-0.002866,0.034744,0.000428
4,510050.OF,50ETF,2005-02-04,-0.105533,0.027939,-0.046164,0.001879,0.039884,0.000428


计算基金池内各基金真实Alpha的估计值并排序（使用Cahart四因子OLS回归模型）

In [9]:
funds_list = cmodel.code.unique() # 基金清单

In [10]:
alpha = []
ta = []

for fund in funds_list:
    tmp_f = cmodel[cmodel['code'] == fund].reset_index(drop=True)
    X = tmp_f.loc[:, 'mkt_rf':'umd']
    Y = tmp_f.ri - tmp_f.rf
    
    true_ols = sm.OLS(Y, sm.add_constant(X)).fit()
    alpha_i = true_ols.params[0]
    ta_i = true_ols.tvalues[0]
    
    alpha.append(alpha_i) # 每只基金的真实Alpha(使用OLS回归后的估计值)
    ta.append(ta_i) # Alpha估计值的t-value

In [11]:
print("alpha:", alpha[:5],"\n","ta:",ta[:5])

alpha: [0.0007632565614886969, -0.0008554388280237516, -0.0010060216375671872, 0.00578850129619422, 0.006753065490592524] 
 ta: [1.2051567717981015, -1.3967463701635694, -1.1266762499223177, 0.9316754271468579, 1.2491767356564019]


## Fama&French模型（6.18 - 6.24）

Fama&French模拟回报率伪时间序列的方式与Kosowsk模型略有不同，主要步骤如下；
1. 计算所有基金真实alpha的估计值(已计算)
2. 使用F&F公式计算单支基金的伪回报率；***伪回报率R`=真实回报率R - 真实alpha的估计值***
3. 对基金按月份分组，Bootstrap月数据，模拟伪回报率的时间序列，这种方法保留了周数据中基金组的截面数据
4. 对单支基金伪回报率的伪时间序列做Cahart OLS回归，得到单支基金伪alpha
5. 排序所有基金的伪alpha

In [12]:
funds_list
alpha = np.array(alpha).reshape(328,) # 调整alpha和ta的数列构造
ta = np.array(ta).reshape(328,)

# 转换成数据表格
df ={'code':funds_list, 'alpha':alpha, 'ta':ta }
df = pd.DataFrame(df)

# 与cahart四因子数据合并
fmodel = pd.merge(cmodel, df, on='code')

# 计算基金的伪回报率
fmodel['fk_r'] = fmodel['ri'] - fmodel['alpha']
fmodel.head(5)

Unnamed: 0,code,name,trdwk,ri,mkt_rf,smb,hml,umd,rf,alpha,ta,fk_r
0,510050.OF,50ETF,2005-01-07,-0.01506,-0.009569,0.016692,-0.0016,-0.026043,0.000428,0.000763,1.205157,-0.015823
1,510050.OF,50ETF,2005-01-14,0.002039,0.00233,0.006033,0.002732,-0.003396,0.000428,0.000763,1.205157,0.001275
2,510050.OF,50ETF,2005-01-21,0.005086,-0.008801,0.004323,0.002299,-0.018932,0.000428,0.000763,1.205157,0.004323
3,510050.OF,50ETF,2005-01-28,-0.012146,-0.022081,-0.017804,-0.002866,0.034744,0.000428,0.000763,1.205157,-0.012909
4,510050.OF,50ETF,2005-02-04,-0.105533,0.027939,-0.046164,0.001879,0.039884,0.000428,0.000763,1.205157,-0.106296


对基金按月份分组，Bootstrap月数据
1. 取基金代码、数据时间戳、伪回报列，按时间戳列出所有基金时间点上的伪回报（窄表转宽表）

In [13]:
def long_to_wide_f(fk_r_df):
    # 保留原始时序标签
    time = pd.DataFrame(fk_r_df['trdwk'].unique()).sort_values(by=0).reset_index(drop=True) 

    # 行-时间，列-所有基金，单元格表示一只基金在一个时间点的伪回报率（回报减去alpha）
    f_spl = fk_r_df.pivot_table(index='trdwk',
                                columns = 'code',
                                values = 'fk_r') 

    # 去掉宽表中的多层索引
    f_spl1 = np.array(f_spl)
    columns = f_spl.columns.values
    f_spl_df = pd.DataFrame(f_spl1, columns = columns).fillna(0) # 补足某些基金在早期时间点上的NaN
    
    return (f_spl_df,time)

In [14]:
fmodel_1 = fmodel[['trdwk','code','fk_r']]
f_spl_df = long_to_wide_f(fmodel_1)

In [15]:
f_spl_df[0].shape

(715, 328)

2. 按时间戳index进行抽样(已将时间戳由日期转为int1-713)

In [16]:
# 时间戳index抽样1000次（nb = 1000）
def f_sampling(f_spl_df):
    f_series=pd.Series(range(len(f_spl_df[0])))
    tseries = importr('tseries')
    #tsbootstrap = robjects.r['tsbootstrap']()
    spl_f = tseries.tsbootstrap(f_series, nb = 1000, m = 1)
    spl_f0 = np.array(spl_f) # (715,1000)
    spl_f1 = spl_f0.T # 转置为（1000，715）
    
    return spl_n1

3. 按每一次抽样时间戳制作伪时间序列，和真实时间轴上的Cahart四因子按按时间索引进行合并

In [17]:
# 工具1：宽表变窄表
def f_ols_caculator(df_wide, time, c_factors):
    
    # 宽表加上短时间格式的时间戳（由[0,715]变成['2005-1-7', '2018-12-31']）
    df_wide0 = pd.merge(df_wide,time,left_index = True,right_index=True).rename(columns = {0:'trdwk'})
    
    # 与cahart周频四因子数据合并
    c_factors.reset_index(drop=True)
    df_wide1 = pd.merge(df_wide0, c_factors, how='left', on='trdwk').fillna(method='ffill').set_index('trdwk')

    return df_wide1

In [18]:
# 工具2：alpha和t-value数列按alpha值排序
def series_manipulator(alpha, ta):
    # 将两个List合并到一起，按alpha排序
    alpha = np.array(alpha).reshape(len(alpha),1)
    ta = np.array(ta).reshape(len(ta),1)

    one_fund_array = np.hstack([alpha, ta])
    one_fund_array = one_fund_array[np.argsort(one_fund_array[:,0]),:]
    
    # 将排好序的alpha和其t-value分开，变成两个二维数组
    one_fund_list = np.hsplit(one_fund_array,2)
    
    return one_fund_list  

4. 对单支基金伪回报率的伪时间序列做Cahart OLS回归，得到单支基金伪alpha
5. 排序所有基金的伪alpha

In [147]:
f_fk_alpha_series = []
f_fk_ta_series = []

for i in spl_n1:
    f_spl_df_b = f_spl_df[0].iloc[i].reset_index(drop=True) # 按一次抽样时间戳序列排列各基金伪回报
    f_spl_df_long = wide_to_long_f(f_spl_df_b,f_spl_df[1]) # 工具1
    
    # 抽样完成后与真实时间轴上的Cahart四因子按按时间索引进行合并
    c_factors.reset_index(drop=True)
    f_spl_df1 = pd.merge(f_spl_df_b_long, c_factors, how='left', on='trdwk').fillna(method='ffill')
    
    # OLS回归计算伪alpha分布
    f_fk_alpha = []
    f_fk_ta = []
    for fund in funds_list:
        Y = f_spl_df_ols[fund] - f_spl_df_ols['rf']
        X = f_spl_df_ols.loc[:,'mkt_rf':'umd']
        fake_result = sm.OLS(Y, sm.add_constant(X)).fit()
        fk_alpha_i = fake_result.params[0]
        fk_ta_i = fake_result.tvalues[0]
    
        f_fk_alpha.append(fk_alpha_i)
        f_fk_ta.append(fk_ta_i)
    
    f_list = series_manipulator(f_fk_alpha, f_fk_ta) # 工具2
    
    f_fk_alpha_series.append(f_list[0]) # Exp：(328,1000)
    f_fk_ta_series.append(f_list[1]) # Exp: (328,1000)      

NameError: name 'spl_n1' is not defined

#### --------------------------------- Split Line ------------------------------------- ####

test_f0 = np.random.randint(0, 715, size=(1000, 715))
test_f0.shape

for i in test_f0[:1]:
    f_spl_df_b = f_spl_df[0].iloc[i].reset_index(drop=True)

time = f_spl_df[1]
f_spl_df_ols = f_ols_caculator(f_spl_df_b,time, c2)

f_fk_alpha = []
f_fk_ta = []
for fund in funds_list:
    Y = f_spl_df_ols[fund] - f_spl_df_ols['rf']
    X = f_spl_df_ols.loc[:,'mkt_rf':'umd']
    fake_result = sm.OLS(Y, sm.add_constant(X)).fit()
    fk_alpha_i = fake_result.params[0]
    fk_ta_i = fake_result.tvalues[0]
    
    f_fk_alpha.append(fk_alpha_i)
    f_fk_ta.append(fk_ta_i)
    
f_list = series_manipulator(f_fk_alpha, f_fk_ta)    

f_spl_df_ols

test_f0 = np.random.random((1000, 715))
test_f0[0].shape

np.random.seed(0)
np.random.random((2,3))

np.random.seed(0)
np.random.random((2,3)).T

