In [1]:
import os

import numpy as np
import pandas as pd
import tushare as ts
# Take the Tushare API token from the TUSHARE_TOKEN environment variable so that a
# real credential never has to be saved inside the notebook. When the variable is
# not set, the original placeholder string is used, which must then be edited by hand.
token = os.environ.get('TUSHARE_TOKEN', 'your token')
ts.set_token(token)
pro = ts.pro_api(token)

In [2]:
# Request the daily series for index code 000001.SH over 2005-12-10 .. 2022-06-30.
# Dates are passed as 'YYYYMMDD' strings.
df = pro.index_daily(
    ts_code='000001.SH',
    start_date='20051210',
    end_date='20220630',
)

In [3]:
# Index the rows by trading date (the 'trade_date' column holds 'YYYYMMDD' values and
# is kept as a regular column too), order them chronologically, and remove the
# columns that are not used as features.
# Written as a single chain without inplace=True, and with errors='ignore' on the
# drop, so that re-running this cell on the already-processed frame does not raise
# KeyError for columns that were removed by the first run.
df = (
    df.set_index(pd.to_datetime(df['trade_date'], format='%Y%m%d'))
      .sort_index()
      .drop(columns=['pre_close','change','pct_chg','amount'], errors='ignore')
)

In [4]:
def cal_macd_system(data,short_,long_,m,price_col='Z_close'):
    '''
    Compute the MACD lines for one price column and attach them to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column named by ``price_col``. The frame is
        modified in place: the columns ``diff``, ``dea`` and ``macd`` are
        added (or overwritten if they already exist).
    short_, long_, m : number
        Periods of the fast EMA, the slow EMA and the signal-line EMA.
        Each period is converted to a smoothing factor ``2 / (period + 1)``.
    price_col : str, default 'Z_close'
        Name of the column the EMAs are computed from. The default keeps the
        behaviour of earlier versions, which always read ``'Z_close'``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``data`` (not a copy), now
        holding ``diff`` (fast EMA - slow EMA), ``dea`` (EMA of ``diff``) and
        ``macd`` (``2 * (diff - dea)``).

    Raises
    ------
    KeyError
        If ``price_col`` is not a column of ``data``.
    '''
    def _ema(series, period):
        # Recursive (adjust=False) exponential average; ignore_na=True makes
        # missing values not count towards the decay of earlier observations.
        return series.ewm(adjust=False,alpha=2/(period+1),ignore_na=True).mean()

    price = data[price_col]
    data['diff'] = _ema(price, short_) - _ema(price, long_)
    data['dea'] = _ema(data['diff'], m)
    data['macd'] = 2*(data['diff']-data['dea'])
    return data

In [5]:
def rsi(price, period=6):
    """
    Relative Strength Index based on simple (unweighted) moving averages.

    For every window of ``period`` consecutive price changes the result is
    ``100 * mean(gains) / (mean(gains) + mean(losses))``, where a gain is a
    positive change, a loss is the absolute value of a negative change, and
    each is 0 when the change has the other sign.

    Parameters
    ----------
    price : pandas.Series
        Price series; changes are taken between consecutive rows. Rows whose
        change is NaN (always the first row) are dropped before windowing.
    period : int, default 6
        Number of price changes per window; must be at least 1.

    Returns
    -------
    pandas.Series
        float64 values labelled with the index of the last change in each
        window, i.e. the labels of the non-NaN changes from position
        ``period - 1`` onwards. A window without any price movement yields
        NaN. If there are fewer than ``period`` changes the result is an
        empty Series.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be at least 1")

    change = price.diff().dropna()
    # Split the changes into two non-negative float arrays. Building them with
    # `where` avoids writing floats into an integer-typed Series.
    gains = change.where(change > 0, 0.0).to_numpy(dtype=float)
    losses = (-change).where(change < 0, 0.0).to_numpy(dtype=float)

    n_windows = len(change) - period + 1
    if n_windows <= 0:
        # Not enough observations for a single window.
        return pd.Series([], index=change.index[:0], dtype=float)

    # Each window is averaged independently, so an all-zero window gives an
    # exact 0 (no drift from a running sum).
    avg_gain = np.array([gains[i:i + period].mean() for i in range(n_windows)])
    avg_loss = np.array([losses[i:i + period].mean() for i in range(n_windows)])

    # 0 / 0 (no movement inside the window) is deliberately left as NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        values = 100 * avg_gain / (avg_gain + avg_loss)

    return pd.Series(values, index=change.index[(period - 1):])

In [6]:
# Z-score standardisation over a rolling 252-row window.
# The four price columns are all centred and scaled with the rolling mean/std of
# `close`; volume is scaled with its own rolling statistics.
close_roll = df['close'].rolling(252)
close_mean = close_roll.mean()
close_std = close_roll.std()
df['Z_close'] = (df['close'] - close_mean) / close_std
df['Z_open'] = (df['open'] - close_mean) / close_std
df['Z_high'] = (df['high'] - close_mean) / close_std
df['Z_low'] = (df['low'] - close_mean) / close_std
df['Z_vol'] = (df['vol'] - df['vol'].rolling(252).mean()) / df['vol'].rolling(252).std()

# Try a few factors from Alpha101 (0.001 keeps the Alpha101 denominator away from zero).
bar_body = df['Z_close'] - df['Z_open']
bar_range = df['Z_high'] - df['Z_low']
df['Alpha101'] = bar_body / (bar_range + 0.001)
df['Alpha028'] = (df['Z_high'] + df['Z_low']) / 2 - df['Z_close']
df['Alpha022'] = df['high'].rolling(5).corr(df['vol'])

# 20-day and 60-day moving averages of close, put on the same z-score scale as the prices.
df['MA20'] = (df['close'].rolling(20).mean() - close_mean) / close_std
df['MA60'] = (df['close'].rolling(60).mean() - close_mean) / close_std

# Technical indicators: RSI (rescaled by 1/100) and MACD.
df['RSI6'] = rsi(df['close'], period=6) / 100
df['RSI24'] = rsi(df['close'], period=24) / 100

# MACD with periods 12 / 26 / 9; adds the columns diff, dea and macd.
df = cal_macd_system(df, 12, 26, 9)

In [7]:
# Chronological split on the date index; label slices with .loc include both end points.
df_train = df.loc['20070101':'20161230']
df_test = df.loc['20170103':'20220630']

In [8]:
# Preview the first 20 rows of the training split.
df_train.head(20)

Unnamed: 0_level_0,ts_code,trade_date,close,open,high,low,vol,Z_close,Z_open,Z_high,...,Alpha101,Alpha028,Alpha022,MA20,MA60,RSI6,RSI24,diff,dea,macd
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-01-04,000001.SH,20070104,2715.719,2728.188,2847.615,2684.818,120156021.0,3.43997,3.478942,3.852213,...,-0.076442,0.157831,0.962254,2.280233,1.352449,1.0,0.877318,0.098204,0.038712,0.118985
2007-01-05,000001.SH,20070105,2641.334,2668.577,2685.804,2617.019,106155625.0,3.139624,3.22346,3.276474,...,-0.394198,0.031012,0.918318,2.298682,1.357104,0.790076,0.807093,0.094938,0.049957,0.089962
2007-01-08,000001.SH,20070108,2707.199,2621.068,2708.444,2620.625,106813244.0,3.266033,3.005531,3.269798,...,0.9771,-0.129038,0.847758,2.323596,1.361162,0.802292,0.811174,0.101381,0.060241,0.082278
2007-01-09,000001.SH,20070109,2807.804,2711.049,2809.394,2691.36,110751267.0,3.478914,3.192154,3.483626,...,0.817385,-0.1702,0.728457,2.353705,1.364973,0.822972,0.828367,0.122255,0.072644,0.099222
2007-01-10,000001.SH,20070110,2825.576,2838.113,2841.741,2770.988,111769365.0,3.442539,3.478964,3.489504,...,-0.176336,-0.055817,0.829672,2.394186,1.369004,0.817124,0.821416,0.134315,0.084978,0.098673
2007-01-11,000001.SH,20070111,2770.11,2819.367,2841.18,2763.886,121598717.0,3.208129,3.348849,3.411165,...,-0.634395,0.092627,0.783457,2.419975,1.37526,0.633539,0.777168,0.123533,0.092689,0.061688
2007-01-12,000001.SH,20070112,2668.11,2745.321,2782.025,2652.578,107303768.0,2.862928,3.080718,3.184249,...,-0.594839,0.138755,0.704986,2.435801,1.38346,0.44279,0.720104,0.086141,0.09138,-0.010477
2007-01-15,000001.SH,20070115,2794.701,2660.07,2795.331,2658.879,91761561.0,3.149964,2.776339,3.151713,...,0.984056,-0.187591,0.674996,2.457684,1.389664,0.663749,0.747573,0.078761,0.088856,-0.02019
2007-01-16,000001.SH,20070116,2821.017,2818.663,2830.803,2757.205,111178574.0,3.152615,3.146188,3.179332,...,0.031826,-0.073748,0.696546,2.477803,1.395874,0.632732,0.790576,0.072293,0.085543,-0.026501
2007-01-17,000001.SH,20070117,2778.9,2828.401,2870.422,2742.588,127377480.0,2.977135,3.110351,3.223436,...,-0.386106,0.07429,0.839518,2.49324,1.403126,0.460968,0.747915,0.052403,0.078915,-0.053025


In [9]:
# Write both splits to CSV files in the current working directory
# (the date index is written as the first column).
df_train.to_csv(path_or_buf='train.csv')
df_test.to_csv(path_or_buf='test.csv')