In [46]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import time
import warnings
import copy
import os
warnings.filterwarnings("ignore")
from sys import getsizeof

# TODO: load the real trade-date array here.
# Placeholder only: this binds `trade_dates` to the Ellipsis object; the
# data-loading cell later rebinds it from 'tradedates.npy'.
trade_dates = ...

In [2]:
def get_window(data_dir: str,
               trade_dates: np.ndarray,
               time_window: int,
               current_index: int = 0,
               how:str = 'inner'
              ) -> pd.DataFrame:
    """
    Assemble per-day ``eps`` columns from disk into one in-memory table.

    For each of the ``time_window`` consecutive entries of ``trade_dates``
    starting at ``current_index``, the file ``data_dir + 'eps_<date>.csv'``
    is read with its second column as the index and its ``eps`` column is
    kept. The columns are then aligned on the index and labelled with the
    corresponding trade dates.

    This function is only needed for the first run; the resulting table has
    since been written to a new CSV file, so later runs load it directly
    with ``read_csv()`` instead of calling this function.

    Parameters
    ----------
    data_dir : str
        Path prefix that is concatenated directly with the file name, so it
        must already end with a path separator.
    trade_dates : np.ndarray
        Dates used both to build the file names and as column labels.
    time_window : int
        Number of consecutive dates to read; must be at least 1.
    current_index : int
        Position in ``trade_dates`` of the first date to read.
    how : str
        ``join`` argument forwarded to ``pd.concat`` ('inner' keeps the
        intersection of the indexes, 'outer' the union).

    Returns
    -------
    pd.DataFrame
        One row per index value, one column per date in the window. This is
        a DataFrame even when ``time_window == 1``.

    Raises
    ------
    ValueError
        If ``time_window`` is smaller than 1.
    IndexError
        If the window runs past the end of ``trade_dates``.
    """
    # FIXME: change the way to read data
    if time_window < 1:
        raise ValueError(f'time_window must be >= 1, got {time_window}')

    # Index element by element so an out-of-range window raises IndexError
    # instead of being silently truncated by a slice.
    window_dates = [trade_dates[current_index + offset] for offset in range(time_window)]

    # Read every day's series first and align them in a single concat call;
    # concatenating inside the loop re-copies the growing table each time.
    daily_eps = [
        pd.read_csv(data_dir + 'eps_{}.csv'.format(date), index_col=1)['eps']
        for date in window_dates
    ]
    df = pd.concat(daily_eps, join=how, axis=1)  # 'inner' -> intersection of indexes

    df.columns = window_dates

    return df

In [3]:
def get_frame(data:pd.DataFrame,
              time_window:int,
             index_num:int,
             direction:str)->pd.DataFrame:
    """
    Cut a window of columns out of an in-memory table for computation.

    The window is positioned relative to ``index_num + 1``:

    * ``'forward'``  -> columns ``[index_num + 1, index_num + 1 + time_window)``
      (the columns after ``index_num``);
    * ``'backward'`` -> columns ``[index_num + 1 - time_window, index_num + 1)``
      (the window ends on, and includes, column ``index_num``).

    Rows containing any NaN inside the window are dropped.

    Parameters
    ----------
    data : pd.DataFrame
        Table to slice by column position.
    time_window : int
        Number of columns in the window.
    index_num : int
        Positional column index the window is anchored on.
    direction : str
        Either ``'forward'`` or ``'backward'``.

    Returns
    -------
    pd.DataFrame
        The selected columns with incomplete rows removed.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``'forward'`` nor ``'backward'``.
    """
    anchor = index_num + 1
    if direction == 'forward':
        start, stop = anchor, anchor + time_window
    elif direction == 'backward':
        # Clamp at 0: a negative start would be read by iloc as "count from
        # the end" and silently select the wrong (usually empty) range when
        # the requested history is longer than what is available.
        start, stop = max(anchor - time_window, 0), anchor
    else:
        raise ValueError(
            f"direction must be 'forward' or 'backward', got {direction!r}"
        )

    return data.iloc[:, start:stop].dropna()

In [4]:
def z_score(X: pd.DataFrame)->pd.DataFrame:
    """
    Standardise ``X`` by subtracting its mean and dividing by its standard deviation.

    The mean and standard deviation are obtained through ``np.mean`` and
    ``np.std``; for plain arrays and Series this is the population standard
    deviation (``ddof=0``).
    NOTE(review): for a DataFrame argument the axis these reductions run over
    is whatever ``np.mean``/``np.std`` dispatch to in the installed pandas
    version - confirm before relying on column-wise scaling.

    The former ``display(X)`` debugging call was removed: it is only defined
    inside IPython and produced output on every call.
    """
    return (X - np.mean(X)) / np.std(X)

In [5]:
def find_trade_day(trade_dates:np.ndarray, day:int, direction:str = 'backward'):
    """
    Map an arbitrary date number onto a trading day that exists in ``trade_dates``.

    If ``day`` is itself in ``trade_dates`` it is returned unchanged. Otherwise
    the neighbour is located relative to the first entry that is greater than
    ``day``, so ``trade_dates`` is expected to be in ascending order:

    * ``'backward'`` -> the entry just before that position (the latest
      trading day before ``day``), or the first entry when ``day`` precedes
      all of them;
    * ``'forward'``  -> the entry at that position (the earliest trading day
      after ``day``).

    When ``day`` is later than every entry, the last entry is returned for
    both directions (previously the first entry was returned by accident,
    because ``np.argmax`` of an all-False mask is 0).

    Raises
    ------
    ValueError
        If ``trade_dates`` is empty, or if ``day`` is not a trading day and
        ``direction`` is neither ``'backward'`` nor ``'forward'``.
    """
    trade_dates = np.asarray(trade_dates)
    if trade_dates.size == 0:
        raise ValueError('trade_dates is empty')

    if day in trade_dates:  # already a trading day
        return day

    if direction not in ('backward', 'forward'):
        raise ValueError(
            f"direction must be 'backward' or 'forward', got {direction!r}"
        )

    later_positions = np.flatnonzero(trade_dates > day)
    if later_positions.size == 0:
        # day lies beyond the calendar: the closest available day is the last one
        return trade_dates[-1]

    first_later = later_positions[0]
    if direction == 'backward':
        return trade_dates[max(0, first_later - 1)]
    return trade_dates[first_later]

In [17]:
def cal_strategy(period, step, start_day_index, skip_first_day,
                 get_Xt, # key callable that produces the signal
                )-> tuple:
    """
    Multiply the weights (signal) by the future returns to obtain the
    strategy's future return.

    Note: each column label is the day whose data (that day and earlier,
    inclusive) is used to predict the future return.

    For every ``i`` in ``range(0, period, step)`` the function:

    1. takes the backward window of ``train_window`` columns of
       ``total_return`` ending on day ``start_day_index + i``;
    2. applies ``get_Xt`` to every row to get one score per stock;
    3. keeps only stocks listed in ``target_stock`` for that day;
    4. z-scores the scores and clips them to [-3, 3];
    5. builds 'top' weights from positive scores, 'bottom' weights from
       negative scores, and 'all' as their sum;
    6. multiplies each weight column by the summed forward window of
       ``test_window`` columns of ``total_return``.

    Parameters
    ----------
    period : number of trade-date positions to cover.
    step : stride between evaluated days.
    start_day_index : position of the first evaluated day in ``trade_dates``.
    skip_first_day : when ``True`` the forward window starts one column later.
    get_Xt : callable applied row-wise (``axis=1``) to the training frame.

    Returns
    -------
    tuple
        ``(strategy_return, strategy_signal)``: two dicts keyed by
        'all', 'top', 'bottom', each holding a DataFrame with one column
        per evaluated day.

    NOTE(review): the function reads the module-level names ``trade_dates``,
    ``total_return``, ``train_window``, ``test_window``, ``target_stock`` and
    ``note``; none of them are parameters. When ``note`` is not ``None`` the
    results are also written to CSV files.
    """
    

    daily_return = copy.deepcopy({'all':[],'top':[],'bottom':[]})
    daily_signal = copy.deepcopy({'all':[],'top':[],'bottom':[]})
    # Walk through every day that has to be evaluated
    for i in range(0,period,step): 
        
        current_day_index =  start_day_index + i # position of the current trading day in the trade-date array
        current_day = trade_dates[current_day_index]

        # Fetch the frame used for the computation
        
#         df_price = get_frame(data=total_price,
#                              time_window=train_window,
#                              index_num = current_day_index,
#                              direction='backward') 
        
        df_return = get_frame(data=total_return,
                             time_window=train_window, 
                             index_num = current_day_index,
                             direction='backward') 
        
        #  Pass the frame to the signal function. Note that long/short has to be decided when the signal is generated
#         X = baz_get_Xt(df_price, current_day) 
#         X = moskowitz_get_Xt(df_return, current_day)
        X = df_return.apply(get_Xt,axis=1).sort_values(ascending=False)
    
#         Intersect with the stock pool
        
        current_target_stocks = target_stock.loc[:][current_day].dropna()
        current_target_stocks = current_target_stocks.astype(int)
        X.index = X.index.astype(int)
        intersec_stock = pd.Series(list(set(X.index).intersection(set(current_target_stocks.values))))
        X = X.loc[intersec_stock][:]
        df_return=df_return.loc[intersec_stock][:]
        
        X = X.astype(float)
        
        X = (X - np.mean(X)) / np.std(X)  # z-score
        # Clip the standardised scores to [-3, 3]
        X[X > 3] = 3
        X[X < -3]  = -3


        top =  X[X > 0].index
        bottom = X[X < 0].index

#         # Median split
#         median = X.median()
#         top =  X[X > median].index
#         bottom = X[X < median].index

        df_X = pd.DataFrame(np.zeros((len(df_return.index),3)), # start from an all-zero matrix so the addition below works
                         columns=['all','top','bottom'], index=df_return.index)

        # Positive scores are scaled to sum to 1; negative scores are scaled
        # by their (negative) sum and negated, so 'bottom' weights sum to -1.
        df_X.loc[top, 'top'] = X[top] / X[top].sum()
        df_X.loc[bottom ,'bottom'] = - X[bottom] / X[bottom].sum()
        df_X['all'] = df_X['top'] + df_X['bottom']
        # Zero weight means "not held": turn it into NaN so it drops out later
        df_X.replace(0, np.nan, inplace=True)
        X = df_X

        if skip_first_day is True: # if the first day is to be skipped
            current_day_index += 1
        
        # Fetch the frame used to compute the future returns
        test_return = get_frame(data=total_return,
                               time_window=test_window,
                               index_num=current_day_index,
                               direction='forward')
        
        # Compute today's return
        for column in X.columns:
            daily_signal[column].append(X[column])
            daily_return[column].append((test_return.sum(axis=1) * X[column]).dropna())
         
        

    
    # Concatenate the Series collected in the lists
    strategy_return = copy.deepcopy({})
    strategy_signal = copy.deepcopy({})
    for key in daily_return.keys():
        strategy_return[key] = pd.concat(daily_return[key], axis=1, join='outer')
        strategy_signal[key] = pd.concat(daily_signal[key], axis=1, join='outer')
        
        # Label each column with the trade date it was evaluated on
        strategy_return[key].columns= trade_dates[range(start_day_index,start_day_index + period, step)]
        strategy_signal[key].columns= trade_dates[range(start_day_index,start_day_index + period, step)]
        
    # 'all' is recomputed as top minus bottom (NaN treated as 0, exact zeros
    # turned back into NaN); only afterwards is the sign of 'bottom' flipped.
    strategy_return['all'] = (strategy_return['top'].replace(np.nan,0)-strategy_return['bottom'].replace(np.nan,0)).replace(0,np.nan)
    strategy_return['bottom'] = -strategy_return['bottom']
    print(daily_return.keys())
    # NOTE(review): `note` is not a parameter here; it is looked up as a module-level name.
    if note is not None:
        for key in daily_return.keys():
            strategy_return[key].to_csv(f'./data/strategy_return_{key}/' + note +'.csv',encoding='gbk',index=True)
            strategy_signal[key].to_csv(f'./data/strategy_signal_{key}/' + note +'.csv',encoding='gbk',index=True)
            # Write the header only when the summary file does not exist yet
            header = not os.path.isfile('XSMOM_return.csv')
            # This rebinds the local name `daily_return`; the loop keeps iterating
            # over the key view of the original dict obtained above.
            daily_return = pd.DataFrame(strategy_return[key].sum(),columns=[note,]).T
            daily_return.to_csv('XSMOM_return.csv', header=header,index=True,encoding='gbk',mode='a')
    
    return strategy_return, strategy_signal

In [18]:
# Diagnostic run of a single feature function
# NOTE(review): this cell needs `info_df` and `main`, which are defined in
# cells that appear further down in the notebook.
train_window = 5
time1 = time.time()
i = 0
time1 = time.time()

# NOTE(review): eval() executes the text stored in info_df as Python code;
# only safe while that table comes from a trusted file.
get_Xt = eval(info_df['函数程序名'][i])
note = None  # None: nothing is written to disk

main(get_Xt=get_Xt,
     start_day=20210101,
     end_day=20221231,
     skip_first_day=True,
     step=5,
     note=note,
     )

print(f'Calculation of No. {str(i)} named \'' + info_df.loc[i,'函数程序名'] + '\' has completed!')
time2 = time.time()
print(f'Total time cost = {(time2 - time1)/60 :.2f} mins.')

dict_keys(['all', 'top', 'bottom'])
Calculation of No. 0 named 'abs_energy' has completed!
Total time cost = 0.07 mins.


In [7]:
def show_strategy(strategy_return:dict,
                  strategy_signal:dict,
                  bench_mark:pd.Series,
                  long_short: str,
                  step: int = 5, 
                  note: str = '', 
                 ):
    """
    Compute backtest statistics for each portfolio variant and append them
    to 'XSMOM_report.csv'.

    For every key of ``strategy_return`` the function derives the per-period
    return (column sums), its cumulative sum, mean return and volatility
    scaled by ``step``, their annualised versions (factor 252), information
    and Sharpe ratios, downside deviation, win rate, profit/loss ratio,
    maximum drawdown, Calmar and Sortino ratios, and turnover computed from
    ``strategy_signal``.

    Parameters
    ----------
    strategy_return : dict of DataFrames, one column per evaluated day.
    strategy_signal : dict of DataFrames with the weights, same keys.
    bench_mark : only referenced by the disabled plotting code below.
    long_short : only referenced by the disabled plotting code below.
    step : number of days between two evaluated columns.
    note : row label for the report; when ``None`` nothing is written.

    Returns
    -------
    None

    NOTE(review): the re-ordering of the report rows reshapes the metric
    names with 3 columns, i.e. it assumes exactly three portfolio variants.
    """
    daily_return = copy.deepcopy({})
    backtest_params = copy.deepcopy({})
    cumulative_return = copy.deepcopy({})
    drawdowns = copy.deepcopy({})
    tov_seq = copy.deepcopy({})
    
    for key in strategy_return.keys(): # iterate over every portfolio variant
        
        # Per-period portfolio return and its cumulative series
        daily_return[key] = strategy_return[key].sum()

        
        cumulative_return[key] = daily_return[key].cumsum()


        # Mean daily return, daily volatility, IR, downside deviation DD
        backtest_params[key + '_' + '日均收益率R'] = daily_return[key].mean() / step
        backtest_params[key + '_' + '日均波动率Vol'] = daily_return[key].std() / np.sqrt(step)
        
        backtest_params[key + '_' + '年化收益率ER'] = backtest_params[key + '_' + '日均收益率R'] * 252
        backtest_params[key + '_' + '年化波动率VOL'] = backtest_params[key + '_' + '日均波动率Vol'] * np.sqrt(252)
        
        backtest_params[key + '_' + '信息比率IR'] = backtest_params[key + '_' + '日均收益率R'] / backtest_params[key + '_' + '日均波动率Vol']
        backtest_params[key + '_' + '夏普比率SR'] = backtest_params[key + '_' + '信息比率IR'] * np.sqrt(252)

        # Standard deviation of the negative per-period returns (not rescaled by step)
        backtest_params[key + '_' + '下行偏差DD'] = daily_return[key][daily_return[key] < 0].std()
        
        # Win rate: average of the per-period win rates
        ve_seq = np.count_nonzero(strategy_return[key] > 0, axis = 0) / strategy_return[key].notna().sum(axis=0)
        backtest_params[key + '_' + '胜率VE'] = ve_seq.mean()
    
        # Profit/loss ratio: average of the per-period ratios
        gain = strategy_return[key][strategy_return[key] > 0].mean(axis=0)
        loss = strategy_return[key][strategy_return[key] < 0].mean(axis=0)
        backtest_params[key + '_' + '盈亏比PnL'] = (gain / abs(loss)).mean()
        
        # Maximum drawdown: distance of the cumulative return from its running maximum
        max_so_far = cumulative_return[key].values[0]
        drawdowns[key] = []
        for trade_day in cumulative_return[key].index:
            
#             display(cumulative_return[key])
#             print(key)
#             print(trade_day)
            if cumulative_return[key][trade_day] > max_so_far:
                drawdown = 0
                drawdowns[key].append(drawdown)
                max_so_far = cumulative_return[key][trade_day]
            else:
                drawdown =  max_so_far - cumulative_return[key][trade_day]
                drawdowns[key].append(drawdown)
            
        
        backtest_params[key + '_' + '最大回撤MDD'] = max(drawdowns[key])
        
        # Calmar ratio and Sortino ratio
        backtest_params[key + '_' + '卡玛比率Calmar'] = backtest_params[key + '_' + '年化收益率ER'] / backtest_params[key + '_' + '最大回撤MDD']
        backtest_params[key + '_' + '索提诺比率Sortino'] = backtest_params[key + '_' + '年化收益率ER'] / backtest_params[key + '_' + '下行偏差DD']
        
        # Turnover
#         daily_holding = strategy_signal[key] / strategy_signal[key].notna().sum(axis=0)# first convert the trading signal into holdings
        daily_holding = strategy_signal[key]
        prior = daily_holding.iloc[:, :daily_holding.shape[1] - 1].fillna(0)
        rear = daily_holding.iloc[:,1:daily_holding.shape[1]].fillna(0)
        
        prior.columns, rear.columns = range(daily_holding.shape[1] - 1),range(daily_holding.shape[1] - 1) # align the column labels so the two frames subtract element-wise
        
        # Half of the summed absolute weight change between consecutive periods;
        # a fixed 0.5 is prepended as the value for the first period.
        tov_seq[key] = (rear - prior).abs().sum(axis=0) / 2
        tov_seq[key] = pd.concat([pd.Series(.5), tov_seq[key]])
        backtest_params[key + '_' + '换手率Tov'] = tov_seq[key].mean()
        
    df = pd.DataFrame.from_dict(backtest_params,orient='index')

    
    # Re-order rows for display: group the same metric of the three variants together
    names = np.array(df.index).reshape((-1,3),order='F').reshape((1,-1),order='C').tolist()[0]
    df = df.loc[names,:]

    

    # The triple-quoted block below is a bare string expression: the plotting
    # code inside it is disabled and never executed.
    '''    
        # 绘制回测收益率曲线、当日回撤柱状图、换手率柱状图
        x = pd.to_datetime(daily_return[long_short].index,format='%Y%m%d')
        time_str = time.strftime('%Y%m%d_%H%M%S',time.localtime())


        plt.figure(figsize=(16,5)) 

        plt.plot(x, cumulative_return['all'] * 100, color='r')
        plt.plot(x, cumulative_return['top'] * 100, color='g')
        plt.plot(x, cumulative_return['bottom'] * 100, color='b')

        plt.plot(x, bench_mark * 100, color='k')

        plt.legend(['Strategy_all','Strategy_top','Strategy_bottom','Benchmark'])
        plt.xlabel('Date')
        plt.ylabel('Cumulative Return %')
        plt.title('CUMULATIVE RETURN', )
        if note is not None: 
            note = time_str + '_' + note
            plt.savefig('./XSMOM_image/' + note + '.png') # 保存图片


        plt.figure(figsize=(16,6))
        plt.subplot(211)
        plt.bar(x, - np.array(drawdowns[long_short]) * 100,)
        plt.ylabel('Max Drawdown %')
        plt.title('DRAWDOWN')

        plt.subplot(212)
        tov_show = tov_seq[long_short]
        tov_show[0] = 0
        plt.bar(x,tov_show * 100)

        plt.ylabel('Turnover Rate %')
        plt.xlabel('Date')
        plt.title('TURNOVER RATE')
    '''
    
    # Write to the CSV report.
    if note is None: 
        return
    df_log = df.T
    
    df_log.index = [note,]
    header = not os.path.isfile('XSMOM_report.csv') # write the header only if the file does not exist yet
    df_log.to_csv('XSMOM_report.csv', mode='a', index=True, header=header, encoding='gbk')
    
    return 

In [8]:
def main(get_Xt, # callable that produces the signal for one stock's return window
         start_day:int = 20210101,
         end_day:int = 20221231,
         skip_first_day:bool = True,
         step:int = 5,
         long_short: str = 'all', # one of 'none','short','long','all' (default 'all'); only used for plotting, every variant is always computed
         note: str = '', # free-text label for the run (tuning details etc.); written to the final log file
         read: bool = False,
        ):
    """
    Entry point of the backtest.

    Steps:
    1. initialisation: snap ``start_day`` / ``end_day`` onto existing trading
       days and convert them to positions in the module-level ``trade_dates``;
    2. run the backtest (or, with ``read=True`` and a ``note``, reload the
       previously stored per-variant CSV files);
    3. compute the backtest statistics and store them via ``show_strategy``.

    Parameters
    ----------
    get_Xt : callable forwarded to ``cal_strategy``.
    start_day, end_day : date numbers (YYYYMMDD) bounding the backtest.
    skip_first_day : forwarded to ``cal_strategy``.
    step : stride, in trading days, between evaluated days.
    long_short : forwarded to ``show_strategy``.
    note : label of the run; also the CSV file stem when ``read`` is used.
    read : reload stored results instead of recomputing them.

    Returns
    -------
    int
        Always 0.

    NOTE(review): ``note`` is not passed to ``cal_strategy``; that function
    looks up a module-level ``note`` instead, so the caller has to keep the
    global and this argument in sync.
    """

    # Initialisation

    start_day = find_trade_day(trade_dates, start_day, 'forward')
    # [0][0] extracts the first matching position as a scalar; int() on the
    # whole one-element array is deprecated in recent NumPy versions.
    start_day_index = int(np.where(trade_dates == start_day)[0][0])
    end_day = find_trade_day(trade_dates, end_day, 'backward')
    end_day_index = int(np.where(trade_dates == end_day)[0][0])
    period = end_day_index - start_day_index

    # Cumulative benchmark return sampled on the same days as the strategy
    bench_mark = zz500.iloc[start_day_index:end_day_index].cumsum().iloc[range(0,period,step)]
    if read is True and note is not None:
        strategy_return = {}
        strategy_signal = {}
        # The keys must match the ones cal_strategy writes ('all', 'top',
        # 'bottom'); the previous 'short' pointed at files that are never created.
        for key in ['all', 'top', 'bottom']:

            strategy_return[key] = pd.read_csv(f'./data/strategy_return_{key}/' + note +'.csv',encoding='gbk',index_col=0)
            strategy_signal[key] = pd.read_csv(f'./data/strategy_signal_{key}/' + note +'.csv',encoding='gbk',index_col=0)

    else:
        strategy_return, strategy_signal = cal_strategy(period, step, start_day_index, skip_first_day,get_Xt) ###

    show_strategy(strategy_return, strategy_signal, bench_mark, long_short, step, note)

    return 0

In [9]:
# Data loading and preparation
data_dir = '../华安证券深度神经网络改进时间序列动量策略/baz_TSMOM/'

trade_dates = np.load(data_dir + 'tradedates.npy')
# Keep only the dates inside the studied range
trade_dates = trade_dates[(trade_dates>=20170103) & (trade_dates <= 20230500)]
train_window =  5
test_window = 5


# One-off construction of the cached tables (kept for reference, disabled):
# total_day = trade_dates.shape[0] # number of days available in the files
# total_return = get_window(data_dir=data_dir,
#                           trade_dates=trade_dates,
#                           time_window=total_day, # read everything in one go
#                           current_index=0,
#                           how='outer')
# display(total_return)
# print(f'the size of train_return is {getsizeof(total_return) / (1024 * 1024):4.2f} MB.')
# total_price = total_return.cumsum(axis=1)
# display(total_price)
# total_return.to_csv('total_return.csv')
# total_price.to_csv('total_price.csv')

total_price=pd.read_csv(data_dir + 'total_price.csv',index_col=0)
total_return = pd.read_csv(data_dir + 'total_return.csv' ,index_col=0)
zz500 = pd.read_csv(data_dir + 'ZZ500.csv',index_col=0)
target_stock = pd.read_csv(data_dir + 'target_stock.csv',index_col=0)

total_price.columns = pd.to_numeric(total_price.columns) # numeric labels make the integer date look-ups below work
total_return.columns = pd.to_numeric(total_return.columns)
target_stock.columns = pd.to_numeric(target_stock.columns)
# Restrict the benchmark to the selected trade dates
zz500 = zz500.loc[trade_dates]

display(zz500)
display(total_price)
display(total_return)
display(target_stock)

Unnamed: 0_level_0,ZZ500
dt,Unnamed: 1_level_1
20170103,0.009122
20170104,0.011693
20170105,0.000722
20170106,-0.004323
20170109,0.006874
...,...
20230424,-0.006682
20230425,-0.014597
20230426,0.003923
20230427,0.002569


Unnamed: 0_level_0,20170103,20170104,20170105,20170106,20170109,20170110,20170111,20170112,20170113,20170116,...,20230420,20230421,20230424,20230425,20230426,20230427,20230428,20230504,20230505,20230508
CodeInt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,-0.003937,-0.005309,-0.002760,-0.004754,-0.009661,-0.009346,-0.002768,0.003897,-0.010721,-0.039373,...,0.011273,0.007321,-0.014180,0.003221,-0.016329,-0.016141,0.010179,0.023087,0.035650,0.037762
2.0,0.000711,-0.005624,-0.000174,0.006677,0.005278,-0.004972,-0.007031,0.001514,0.069572,0.034962,...,0.220414,0.217020,0.203900,0.204323,0.181784,0.178815,0.196727,0.181249,0.214145,0.199537
4.0,-0.012706,-0.021648,-0.026045,-0.030087,-0.056350,-0.040720,-0.045468,-0.047901,-0.052015,-0.062331,...,0.633625,0.618704,0.650342,0.635326,0.616098,0.644642,0.630193,0.676022,0.703101,0.671319
5.0,0.006572,0.020866,0.019645,0.028747,0.031316,0.030033,0.019092,0.015007,0.012857,0.023810,...,-0.460491,-0.459247,-0.480376,-0.463113,-0.471025,-0.489864,-0.483946,-0.541763,-0.553666,-0.595196
6.0,0.010616,0.010543,0.010402,0.033770,0.017014,0.000989,0.008082,-0.021703,-0.055263,-0.067230,...,-0.249777,-0.257375,-0.267241,-0.274677,-0.279480,-0.283479,-0.282154,-0.292470,-0.274997,-0.291191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301387.0,,,,,,,,,,,...,-0.091205,-0.110193,-0.157357,-0.166558,-0.168862,-0.161303,-0.156582,-0.122120,-0.145462,-0.140088
603137.0,,,,,,,,,,,...,-0.154557,-0.230836,-0.222200,-0.247647,-0.268766,-0.261459,-0.239417,-0.229618,-0.232630,-0.218486
301307.0,,,,,,,,,,,...,,,,-0.016043,0.000387,0.061570,0.017362,0.003487,-0.054162,-0.043375
301360.0,,,,,,,,,,,...,,,,,0.037177,0.023285,0.074572,0.035899,-0.052570,0.047279


Unnamed: 0_level_0,20170103,20170104,20170105,20170106,20170109,20170110,20170111,20170112,20170113,20170116,...,20230420,20230421,20230424,20230425,20230426,20230427,20230428,20230504,20230505,20230508
CodeInt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,-0.003937,-0.001372,0.002549,-0.001994,-0.004906,0.000314,0.006578,0.006665,-0.014618,-0.028652,...,0.003602,-0.003953,-0.021501,0.017401,-0.019549,0.000188,0.026320,0.012908,0.012563,0.002112
2.0,0.000711,-0.006336,0.005450,0.006851,-0.001399,-0.010250,-0.002059,0.008545,0.068058,-0.034610,...,0.009222,-0.003394,-0.013119,0.000423,-0.022540,-0.002969,0.017913,-0.015478,0.032895,-0.014607
4.0,-0.012706,-0.008942,-0.004397,-0.004043,-0.026262,0.015629,-0.004748,-0.002432,-0.004115,-0.010316,...,0.008010,-0.014921,0.031638,-0.015016,-0.019228,0.028544,-0.014449,0.045829,0.027079,-0.031783
5.0,0.006572,0.014294,-0.001220,0.009102,0.002568,-0.001283,-0.010941,-0.004085,-0.002150,0.010953,...,-0.003068,0.001244,-0.021130,0.017263,-0.007911,-0.018840,0.005918,-0.057817,-0.011903,-0.041529
6.0,0.010616,-0.000073,-0.000141,0.023368,-0.016755,-0.016026,0.007093,-0.029784,-0.033560,-0.011968,...,-0.007278,-0.007598,-0.009866,-0.007437,-0.004803,-0.003999,0.001325,-0.010316,0.017473,-0.016194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301387.0,,,,,,,,,,,...,-0.014547,-0.018988,-0.047164,-0.009201,-0.002304,0.007560,0.004721,0.034462,-0.023341,0.005374
603137.0,,,,,,,,,,,...,-0.154557,-0.076278,0.008636,-0.025447,-0.021118,0.007307,0.022041,0.009799,-0.003012,0.014144
301307.0,,,,,,,,,,,...,,,,-0.016043,0.016430,0.061183,-0.044208,-0.013875,-0.057650,0.010787
301360.0,,,,,,,,,,,...,,,,,0.037177,-0.013891,0.051287,-0.038673,-0.088469,0.099848


Unnamed: 0,20170103,20170104,20170105,20170106,20170109,20170110,20170111,20170112,20170113,20170116,...,20230417,20230418,20230419,20230420,20230421,20230424,20230425,20230426,20230427,20230428
0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,4.0,,...,,,,,,2.0,2.0,2.0,2.0,2.0
1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,,,,...,,,,,,,,,,
2,34.0,34.0,34.0,34.0,34.0,34.0,34.0,,,,...,,,,,,,,,,
3,59.0,59.0,59.0,59.0,59.0,59.0,59.0,,,,...,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,
4,63.0,63.0,,,,,,63.0,63.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2882,,,,,,,,,,,...,,,,,,,,,,
2883,,,,,,,,,,,...,28.0,28.0,28.0,28.0,28.0,28.0,,,28.0,28.0
2884,,,,,,,,,,,...,,,,,,,,,,
2885,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# One-off conversion of the Excel summary into two CSV files (kept for reference, disabled):
# non_param_funcs =pd.read_excel('全函数结果输出汇总_缺失值删除.xlsx', sheet_name='无参函数',usecols=range(4))
# with_param_funcs =pd.read_excel('全函数结果输出汇总_缺失值删除.xlsx', sheet_name='有参函数',usecols=range(4))
# non_param_funcs.replace('\\', np.nan, inplace=True)
# with_param_funcs.replace('\\',np.nan, inplace=True)
# non_param_funcs.dropna(inplace=True)
# with_param_funcs.dropna(inplace=True)
# non_param_funcs.reset_index(drop=True, inplace=True)
# with_param_funcs.reset_index(drop=True, inplace=True)
# non_param_funcs.to_csv('non_param_funcs.csv')
# with_param_funcs.to_csv('with_param_funcs.csv')

# Load both function catalogues and stack them into one table with a fresh 0..n-1 index
non_param_funcs = pd.read_csv('non_param_funcs.csv',index_col=0)
with_param_funcs= pd.read_csv('with_param_funcs.csv',index_col=0)
info_df = pd.concat([non_param_funcs,with_param_funcs],axis=0).reset_index(drop=True)
display(info_df)
info_df.to_csv('info_df.csv', encoding='gbk', index=False)

Unnamed: 0,函数名,函数程序名,运行速度（快：<10，慢：<15）,函数解释
0,abs_energy,abs_energy,快,时间序列的绝对能量
1,absolute_maximum,abs_max,快,最高绝对值
2,absolute_sum_of_changes,abs_sum_changes,快,一阶差分绝对和
3,count_above_mean,count_above_mean,快,高于均值个数占比
4,count_below_mean,count_below_mean,快,低于均值个数占比
5,first_location_of_maximum,first_loc_of_max,快,最大值位置
6,first_location_of_minimum,first_loc_of_min,快,最小值位置
7,kurtosis,kurtosis,慢,峰度
8,last_location_of_maximum,last_loc_of_max,快,最大值最近位置
9,last_location_of_minimum,last_loc_of_min,快,最小值最近位置


In [11]:
# 导入全体无参数的函数
from abs_energy.abs_energy_5 import *
from absolute_maximum.absolute_maximum_5 import *
from absolute_sum_of_changes.absolute_sum_of_changes_5 import *
from count_above_mean.count_above_mean_5 import *
from count_below_mean.count_below_mean_5 import *
from first_location_of_maximum.first_location_of_maximum_5 import *
from first_location_of_minimum.first_location_of_minimum_5 import *
from kurtosis.kurtosis_5 import *
from last_location_of_maximum.last_location_of_maximum_5 import *
from last_location_of_minimum.last_location_of_minimum_5 import *
from longest_strike_above_mean.longest_strike_above_mean_5 import *
from longest_strike_below_mean.longest_strike_below_mean_5 import *
from maximum.maximum_5 import *
from mean.mean_5 import *
from mean_abs_change.mean_abs_change_5 import *
from mean_change.mean_change_5 import *
from mean_second_derivative_central.mean_second_derivative_central_5 import *
from median.median_5 import *
from minimum.minimum_5 import *
from root_mean_square.root_mean_square_5 import *
from skewness.skewness_5 import *
from standard_deviation.standard_deviation_5 import *
from sum_values.sum_values_5 import *
from variance.variance_5 import *
from variation_coefficient.variation_coefficient_5 import *
from benford_correlation.benford_correlation_5 import *

In [12]:
# 导入全体有参数的函数
from agg_autocorrelation.agg_autocorrelation_5 import *
from ar_coefficient.ar_coefficient_5 import *
from augmented_dickey_fuller.augmented_dickey_fuller_5 import *
from autocorrelation.autocorrelation_5 import *
from count_above.count_above_5 import *
from count_below.count_below_5 import *
from index_mass_quantile.index_mass_quantile_5 import *
from large_standard_deviation.large_standard_deviation_5 import *
from linear_trend.linear_trend_5 import *
from mean_n_absolute_max.mean_n_absolute_max_5 import *
from number_crossing_m.number_crossing_m_5 import *
from number_peaks.number_peaks_5 import *
from quantile.quantile_5 import *
from range_count.range_count_5 import *
from ratio_beyond_r_sigma.ratio_beyond_r_sigma_5 import *
from agg_linear_trend.agg_linear_trend_5 import *
from binned_entropy.binned_entropy_5 import *
from c3.c3_5 import *
from change_quantiles.change_quantiles_5 import *
from cid_ce.cid_ce_5 import *
from cwt_coefficients.cwt_coefficients_5 import *
from energy_ratio_by_chunks.energy_ratio_by_chunks_5 import *
from fft_aggregated.fft_aggregated_5 import *
from fft_coefficient.fft_coefficient_5 import *
from fourier_entropy.fourier_entropy_5 import *
from friedrich_coefficients.friedrich_coefficients_5 import *
from lempel_ziv_complexity.lempel_ziv_complexity_5 import *
from number_cwt_peaks.number_cwt_peaks_5 import *
from partial_autocorrelation.partial_autocorrelation_5 import *
from permutation_entropy.permutation_entropy_5 import *
from spkt_welch_density.spkt_welch_density_5 import *
from time_reversal_asymmetry_statistic.time_reversal_asymmetry_statistic_5 import *

In [13]:
# Main sweep: every training-window length combined with every function listed in info_df.
# `train_window` and `note` are rebound as module-level names on each iteration;
# cal_strategy reads both of them as globals.
for train_window in [5, 10, 22, 60, 126, 252]:
    time1 = time.time()
    for i in range(info_df.shape[0]):
        # NOTE(review): eval() executes the text stored in info_df as Python code;
        # only safe while that table comes from a trusted file.
        get_Xt = eval(info_df['函数程序名'][i])
        # Run label: prefix, window length, function name and its description
        note = '_'.join(('SP', str(train_window)+'天信号', info_df.loc[i,'函数程序名'], info_df.loc[i,'函数解释']))

        main(get_Xt=get_Xt,
             start_day=20210101,
             end_day=20221231,
             skip_first_day=True,
             step=5,
             note=note,
             )
        # '\r' with end='' rewrites the same output line as a progress indicator
        print('\r',f'Calculation of No. {str(i)} named \'' + info_df.loc[i,'函数程序名'] + '\' has completed!',
             end='', flush=True)
    time2 = time.time()
    print(f'Total time cost = {(time2 - time1)/60 :.2f} mins.')

 Calculation of No. 57 named 'ti_rev_asym_stat' has completed!!!Total time cost = 32.14 mins.
 Calculation of No. 57 named 'ti_rev_asym_stat' has completed!!!Total time cost = 33.37 mins.
 Calculation of No. 57 named 'ti_rev_asym_stat' has completed!!!Total time cost = 36.81 mins.
 Calculation of No. 57 named 'ti_rev_asym_stat' has completed!!!Total time cost = 42.80 mins.
 Calculation of No. 57 named 'ti_rev_asym_stat' has completed!!!Total time cost = 52.76 mins.
 Calculation of No. 57 named 'ti_rev_asym_stat' has completed!!!Total time cost = 72.16 mins.


In [43]:
# Load the accumulated per-run returns and transpose so that each run becomes a column
XSMOM_return = pd.read_csv('XSMOM_return.csv',encoding='gbk',index_col=0).T
display(XSMOM_return)

Unnamed: 0,SP_5天信号_abs_energy_时间序列的绝对能量,SP_5天信号_abs_energy_时间序列的绝对能量.1,SP_5天信号_abs_energy_时间序列的绝对能量.2,SP_5天信号_abs_max_最高绝对值,SP_5天信号_abs_max_最高绝对值.1,SP_5天信号_abs_max_最高绝对值.2,SP_5天信号_abs_sum_changes_一阶差分绝对和,SP_5天信号_abs_sum_changes_一阶差分绝对和.1,SP_5天信号_abs_sum_changes_一阶差分绝对和.2,SP_5天信号_count_above_mean_高于均值个数占比,...,SP_252天信号_partial_autocorr_给定滞后结束的偏相关函数的值,SP_252天信号_permu_entropy_排列熵,SP_252天信号_permu_entropy_排列熵.1,SP_252天信号_permu_entropy_排列熵.2,SP_252天信号_spkt_welch_dens_不同频率的功率谱,SP_252天信号_spkt_welch_dens_不同频率的功率谱.1,SP_252天信号_spkt_welch_dens_不同频率的功率谱.2,SP_252天信号_ti_rev_asym_stat_时间反转不对称统计量 (time reversal asymmetry statistic.),SP_252天信号_ti_rev_asym_stat_时间反转不对称统计量 (time reversal asymmetry statistic.).1,SP_252天信号_ti_rev_asym_stat_时间反转不对称统计量 (time reversal asymmetry statistic.).2
20210104,0.013034,0.005940,0.006128,0.007782,0.003849,0.004844,0.011119,0.002685,0.005776,0.014554,...,0.003593,0.015638,0.002037,0.009223,0.015105,0.002797,0.007054,0.019100,0.022705,0.002879
20210111,0.017120,0.011706,0.004601,0.013350,0.012651,0.003827,0.009095,0.010206,0.003078,0.005146,...,-0.004904,0.009713,0.003492,0.018533,0.009201,0.012104,-0.006475,0.013479,0.020098,0.005997
20210118,-0.006886,-0.001885,-0.002479,-0.000437,0.002173,-0.002997,-0.005950,-0.002235,-0.004022,-0.004533,...,-0.003625,-0.004570,-0.000223,-0.000568,0.008787,0.003706,0.003343,-0.003835,-0.003358,0.002575
20210125,0.022679,0.023081,-0.000313,0.023516,0.023450,0.000066,0.025279,0.027029,-0.002410,0.018058,...,-0.003995,0.013648,0.007983,0.007761,0.023053,0.028474,0.006315,0.009568,0.026833,0.006699
20210201,0.016840,0.008607,0.008505,0.022112,0.011837,0.008112,0.012880,0.002824,0.008358,0.018664,...,0.013972,0.009849,0.008912,0.005885,0.001421,-0.004069,0.010522,-0.000774,0.002308,0.000835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20221128,0.016305,0.013134,0.004967,0.015265,0.010932,0.004958,0.012548,0.009500,0.003730,0.008156,...,0.003105,-0.002736,0.001631,0.001693,0.009434,0.008640,0.006613,0.003032,-0.007330,0.002400
20221205,0.028931,0.034713,-0.005493,0.022213,0.028078,-0.004939,0.015631,0.022468,-0.004175,0.006349,...,-0.002897,0.003975,-0.000077,0.006983,0.019466,0.017307,0.000195,0.014666,0.016189,-0.001503
20221212,0.012805,0.004002,0.009631,0.011972,0.001886,0.009890,0.013237,0.003849,0.009553,0.018051,...,0.007272,0.009823,0.009283,0.004954,0.028624,0.027394,0.006806,0.017576,0.014940,0.006709
20221219,0.016600,0.005101,0.008263,0.015521,0.005275,0.007962,0.007472,-0.004299,0.009550,0.018375,...,0.006056,0.004202,0.005774,0.004800,0.000483,-0.005969,0.006937,0.001699,-0.010083,0.007864


In [44]:
# Columns repeat in groups of three; take every third column starting at
# offset 0, 1 and 2 respectively.
return_all = XSMOM_return.iloc[:, 0::3]
return_top = XSMOM_return.iloc[:, 1::3]
return_bottom = XSMOM_return.iloc[:, 2::3]

# Persist each subset to its own CSV file.
return_all.to_csv('XSMOM_return_all.csv', index=True, encoding='gbk')
return_top.to_csv('XSMOM_return_top.csv', index=True, encoding='gbk')
return_bottom.to_csv('XSMOM_return_bottom.csv', index=True, encoding='gbk')

In [53]:
# Draw and save one correlation heatmap per return subset
return_list = [return_all,return_top,return_bottom]
names = ['all', 'top', 'bottom']
for name, df in zip(names,return_list):
    
    # NOTE(review): a new 120x120-inch figure is created on every iteration and never closed
    plt.subplots(figsize = (120,120))
    sns.heatmap(df.corr(),annot = True,vmax = 1,square = True,cmap = "Reds")
    plt.savefig(name+'.png')

In [57]:
# Show each return subset
for df in return_list:
    display(df)

In [14]:
# Diagnostic run of a single feature function
train_window = 60
time1 = time.time()
i = 51 # friedrich_coeff
time1 = time.time()

# NOTE(review): eval() executes the text stored in info_df as Python code;
# only safe while that table comes from a trusted file.
get_Xt = eval(info_df['函数程序名'][i])
note = None  # None: nothing is written to disk

main(get_Xt=get_Xt,
     start_day=20210101,
     end_day=20221231,
     skip_first_day=True,
     step=5,
     note=note,
     )

print(f'Calculation of No. {str(i)} named \'' + info_df.loc[i,'函数程序名'] + '\' has completed!')
time2 = time.time()
print(f'Total time cost = {(time2 - time1)/60 :.2f} mins.')

KeyboardInterrupt: 

In [38]:
# Scratch check: call count_above_mean on 24 random normal samples scaled by 10 (unseeded)
a = np.random.randn(24) * 10
display(a)
count_above_mean(a)

array([  1.52501769,  -3.5565113 ,  -9.24051328,   8.9109812 ,
         4.18484228,   3.07310638, -21.85232327,   4.09307845,
        13.14155723, -15.34513936,  -2.90942924,   0.48003414,
        10.97633971,   1.90494727,  -9.64225301,  -7.08333847,
        13.90198031,  25.03413239,  13.78127537,   6.7053755 ,
        16.10028359,  14.06810234,   8.60768091, -15.71275661])

13

In [41]:
def zscore(X):
    """Centre X on its mean and scale it by its standard deviation (np.std)."""
    centre = np.mean(X)
    spread = np.std(X)
    return (X - centre) / spread  # z-score


a = [10,10,0,0,10]
b = [0, 0,10, 10, 0]
# Print the standardised version of each sample on its own line
print(*map(zscore, (a, b)), sep='\n')

[ 0.81649658  0.81649658 -1.22474487 -1.22474487  0.81649658]
[-0.81649658 -0.81649658  1.22474487  1.22474487 -0.81649658]


In [None]:
# Scratch check: count the non-NaN entries in each column of a small frame
s = pd.DataFrame([[np.nan,np.nan,1],[np.nan,np.nan,np.nan],[5,np.nan,np.nan,]])
display(s)
s.notna().sum(axis=0)

In [None]:
# Scratch check of the row re-ordering trick: the six labels are reshaped
# column-major into 2x3 and flattened row-major, which yields the order
# a_x, c_z, e_u, b_y, d_w, f_v.
a = {'a_x':1, 
    'b_y':2,
    'c_z':3,
    'd_w':4,
    'e_u':5,
    'f_v':6}
b = pd.DataFrame.from_dict(a,orient='index')
names = np.array(b.index.values).reshape((-1,3),order='F').reshape((1,-1),order='C')
names.tolist()[0]
b.loc[names.tolist()[0],:]

In [None]:
# Scratch check: apply linear_trend over a rolling window of 10 rows on two random columns (unseeded)
a = np.random.randn(24) * 10
b = np.random.randn(24) * 10

df = pd.DataFrame([a, b]).T
display(df)
df.rolling(10).apply(linear_trend).astype(float).dropna()