In [2]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
# NOTE: every execution of this cell moves one directory further up, so run it only once per kernel.
import os
try:
    os.chdir(os.path.join(os.getcwd(), '..'))
    print(os.getcwd())
except OSError as chdir_error:
    # Best effort: stay in the current directory, but report the failure
    # instead of silently hiding it (and never swallow KeyboardInterrupt).
    print(f'Could not change working directory: {chdir_error}')


/Users/caoyue/Codes/Python/Papy


In [3]:
import pandas as pd
import datetime
import numpy as np


In [3]:
# Look-back length (in rows) of the rolling window used to standardise log returns.
backward_window: int = 60
# Look-ahead length (in rows) of the rolling window used for the cumulative return.
forward_window: int = 5


In [4]:
# Load the two input tables from the HDF5 store. The context manager closes
# the file even when a key is missing or a read fails part-way through.
with pd.HDFStore('data/raw_data.h5') as store:
    # Returns table, sorted by its index.
    ret_df: pd.DataFrame = store.get('ret_df').sort_index()
    # Announcement-date table.
    annodt_df: pd.DataFrame = store.get('annodt_df')


In [5]:
# Give annodt_df an index it can later be joined on: the announcement date is
# appended as an extra index level while the original 'Annodt' column is kept.
annodt_df.set_index('Annodt', append=True, drop=False, inplace=True)
annodt_df.index = annodt_df.index.set_names(['Stkcd', 'Trddt'])

# Extra column holding the day after each announcement, used by the later filter.
# NOTE(review): the lag is one calendar day, not one trading day -- confirm that
# this is intended for announcements made right before a non-trading day.
annodt_df['Annodt_lag'] = annodt_df['Annodt'] + datetime.timedelta(days=1)


In [6]:
# Attach the announcement columns to the return rows by matching on
# (Stkcd, Trddt), then group by stock code for the per-stock filter below.
ret_df_with_annodt: pd.DataFrame = ret_df.join(
    annodt_df, how='outer', on=['Stkcd', 'Trddt'])
ret_df_grouped = ret_df_with_annodt.groupby(level='Stkcd')


def _drop_announcement_rows(stock_df: pd.DataFrame) -> pd.DataFrame:
    # Keep only rows whose trading date is neither one of this stock's
    # announcement dates nor one of its lagged (day-after) dates.
    return stock_df.query('Trddt not in Annodt and Trddt not in Annodt_lag')


ret_df_without_annodt = ret_df_grouped.apply(_drop_announcement_rows)

# Drop the outermost index level (per the original note, this undoes the
# grouping) and discard the announcement columns that are no longer needed.
ret_df_final: pd.DataFrame = (
    ret_df_without_annodt
    .reset_index(level=0, drop=True)
    .drop(columns=['Annodt', 'Annodt_lag'])
)

#############################################################################
# Add some columns for later use.


In [7]:
# Portfolio weight 'dollar_volumn': the previous row's 'Clsprc' of the same
# stock multiplied by the current row's 'Dnshrtrd'.
# The grouped shift() is computed per stock but returns a Series that keeps the
# frame's own row index, so it multiplies and assigns directly -- no Python-level
# apply and no dependence on how apply() prefixes group keys to the index.
ret_df_grouped = ret_df_final.groupby('Stkcd')
ret_df_final['dollar_volumn'] = (
    ret_df_grouped['Clsprc'].shift() * ret_df_final['Dnshrtrd'])


In [8]:
# Log return: log(1 + 'Dretwd'), evaluated with log1p.
ret_df_final['log_ret'] = ret_df_final['Dretwd'].pipe(np.log1p)


In [9]:
ret_df_grouped = ret_df_final.groupby(level='Stkcd')


def _zscore_of_last(window: np.ndarray) -> float:
    # Standardise the newest observation against its own trailing window:
    # (last value - window mean) / window standard deviation.
    return (window[-1] - window.mean()) / window.std()


# Standardised return per stock over the trailing `backward_window` rows.
# raw=True hands each window to the helper as a NumPy array, which is what the
# positional `window[-1]` lookup needs; without it the window type depends on
# the pandas version's default. With an array, .std() is NumPy's population
# standard deviation (ddof=0).
# NOTE(review): switch to window.std(ddof=1) if the sample deviation is wanted.
normalied_df: pd.Series = ret_df_grouped['log_ret'].rolling(
    window=backward_window).apply(_zscore_of_last, raw=True)

# Drop the outermost index level so the result lines up with ret_df_final,
# then attach it as a new column.
normalied_df = normalied_df.reset_index(level=0, drop=True)
ret_df_final['Norm_ret'] = normalied_df

# Remove missing values only once, at the end, so that rows where both columns
# are missing are not over-deleted by two separate clean-up passes.
ret_df_final.dropna(subset=['Norm_ret', 'dollar_volumn'], inplace=True)

##############################################################################
# group the data.


  """


In [10]:
# Preview the first rows of the prepared frame (index and derived columns).
ret_df_final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Clsprc,Dnshrtrd,Dsmvosd,Dretwd,dollar_volumn,log_ret,Norm_ret
Stkcd,Trddt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2007-09-17,36.4,13577318.0,56426835.45,0.00055,493942800.0,0.00055,-0.141667
1,2007-09-18,35.3,19907099.0,54721628.88,-0.03022,724618400.0,-0.030686,-0.859069
1,2007-09-19,36.18,19033887.0,56085794.13,0.024929,671896200.0,0.024623,0.586278
1,2007-09-20,35.97,13596616.0,55760254.7,-0.005804,491925600.0,-0.005821,-0.198387
1,2007-09-21,36.16,15811498.0,56054790.38,0.005282,568739600.0,0.005268,0.08956


In [1]:
def _cap_quintile(daily_cap: pd.Series) -> pd.Series:
    # Five equal-frequency bins over one trading day's 'Dsmvosd' values,
    # labelled from smallest to largest.
    quintile_labels = ['Small', '2', '3', '4', 'Big']
    return pd.qcut(daily_cap, q=5, labels=quintile_labels)


# Group by trading date; this grouped object is reused by the ret_group cell.
ret_df_grouped = ret_df_final.groupby('Trddt', group_keys=False)
ret_df_final['cap_group'] = ret_df_grouped['Dsmvosd'].transform(_cap_quintile)


NameError: name 'ret_df_final' is not defined

In [0]:
def _norm_ret_decile(daily_norm_ret: pd.Series) -> pd.Series:
    # Ten equal-frequency bins over one group's standardised returns,
    # labelled from lowest ('Lo') to highest ('Hi').
    decile_labels = ['Lo', '2', '3', '4', '5', '6', '7', '8', '9', 'Hi']
    return pd.qcut(daily_norm_ret, q=10, labels=decile_labels)


# Relies on `ret_df_grouped` (the per-'Trddt' grouping) built in the cap_group cell.
ret_df_final['ret_group'] = ret_df_grouped['Norm_ret'].transform(
    _norm_ret_decile)



In [0]:
def cumulative_ret(data_serie: pd.Series):
    '''
    Compound a sequence of simple returns into one cumulative return.

    data_serie: array-like of numbers, one simple return per period.
    Returns (1 + r_1) * (1 + r_2) * ... * (1 + r_n) - 1.
    Example: [0.4, 0.3, 0.2] gives 1.4 * 1.3 * 1.2 - 1.
    An empty input gives 0.
    '''
    growth = 1
    for period_ret in data_serie:
        growth *= 1 + period_ret
    return growth - 1


# cumulative_ret([0.4, 0.3, 0.2, 0.1])


In [0]:
# Reverse the time order within each stock: rolling objects only look backwards,
# so rolling over the reversed series is what lets the next step look forward.
# After the descending sort, shift(2) moves every value two rows down, i.e. each
# date ends up holding the 'Dretwd' value found two rows later in time.
ret_df_grouped = ret_df_final.groupby('Stkcd')
tem = ret_df_grouped['Dretwd'].apply(
    lambda serie: serie.sort_index(level='Trddt', ascending=False).shift(2)
).reset_index(
    level=0, drop=True)

# On the reversed, shifted series, compound `forward_window` consecutive rows per
# stock with cumulative_ret, drop the outermost index level, and finally restore
# ascending (Stkcd, Trddt) order before assigning the column.
# NOTE(review): combined with the shift above, cum_ret on a given row appears to
# compound the returns of the 2nd through (forward_window + 1)-th following rows,
# skipping the immediately following one -- confirm that gap is intended.
ret_df_final['cum_ret'] = tem.groupby('Stkcd').rolling(forward_window).apply(
    cumulative_ret).reset_index(
        level=0, drop=True).sort_index(level=['Stkcd', 'Trddt'])
# Discard rows for which no cum_ret value is available.
ret_df_final.dropna(subset=['cum_ret'], inplace=True)


In [0]:
def _dollar_weighted_ret(bucket: pd.DataFrame):
    # Average of 'cum_ret' within one bucket, weighted by 'dollar_volumn'.
    return np.average(bucket['cum_ret'], weights=bucket['dollar_volumn'])


# One portfolio per (trading date, cap_group, ret_group) combination; its return
# is the dollar_volumn-weighted average of the members' cum_ret.
ret_df_grouped = ret_df_final.groupby(['Trddt', 'cap_group', 'ret_group'])
portfolie_ret_serie = ret_df_grouped.apply(_dollar_weighted_ret)


In [0]:
# Experiment with agg(): it performed poorly, so it is parked here for now.
# ret_df_grouped.agg({
#     'cum_ret':
#     lambda x: np.average(x, weights=ret_df_grouped['dollar_volumn'])
# })


In [0]:


# "Losers minus winners": function that computes the reversal returns.
def reverse_port(serie: pd.Series, n_groups: int = 10):
    '''
    Build loser-minus-winner reversal spreads from ranked group returns.

    serie: pd.Series of group returns, ordered with losers first and
        winners last; it is read by position, not by label.
    n_groups: number of ranked groups to pair up. The default of 10 gives
        the five spreads 1st-10th, 2nd-9th, ..., 5th-6th. With an odd
        value the middle group is left unpaired.

    Returns a list of n_groups // 2 spreads, outermost pair first.
    Raises IndexError when serie holds fewer than n_groups values.
    '''
    last_pos = n_groups - 1
    # Pair the k-th group from the bottom with the k-th group from the top.
    return [
        serie.iloc[rank] - serie.iloc[last_pos - rank]
        for rank in range(n_groups // 2)
    ]


# Group the portfolio returns by date and cap_group, then apply reverse_port
# inside every group to obtain that group's reversal spreads.
portfolie_ret_grouped = portfolie_ret_serie.groupby(['Trddt', 'cap_group'])
reverse_ret_in_serie: pd.Series = portfolie_ret_grouped.apply(reverse_port)

# The step above yields a Series whose values are lists; expand it into a
# DataFrame with one column per spread.
reverse_ret_each_day = pd.DataFrame(
    list(reverse_ret_in_serie),
    index=reverse_ret_in_serie.index,
    columns=['Lo-Hi', '2-9', '3-8', '4-7', '5-6'])

# Average every spread over all dates, separately for each cap_group.
reverse_ret_aver = reverse_ret_each_day.groupby('cap_group', sort=False).mean()

# Per the original author's note, cap_group is no longer categorical after the
# mean, so rebuild the index as a CategoricalIndex with the intended order and
# sort by it. The sorted frame is assigned back: previously sort_index() was
# only displayed and `reverse_ret_aver` itself stayed unsorted.
reverse_ret_aver = reverse_ret_aver.set_index(
    pd.CategoricalIndex(
        reverse_ret_aver.index, categories=['Small', '2', '3', '4', 'Big'])
).sort_index()
reverse_ret_aver


In [0]:
# store = pd.HDFStore('data/reverse_portfolie.h5')
# key = 'reverse' + str(backward_window) + '_' + str(forward_window)
# store[key] = reverse_ret_aver
# # store.get('reverse20_5')
# store.close()
