In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler 

# 基本数据预处理

In [64]:
df1=pd.read_csv('data/factor1.csv',header=None)
df2=pd.read_csv('data/factor2.csv',header=None)

In [65]:
df1_yield=pd.read_csv('data/monthyield1.csv')
df2_yield=pd.read_csv('data/monthyield2.csv')

In [66]:
title = pd.read_csv('data/title.csv')

In [67]:
df1.columns=title.columns

In [68]:
df2.columns=title.columns

In [69]:
df=pd.concat([df1,df2])

In [70]:
df.reset_index(inplace=True)

In [71]:
df.drop(columns=['index'],inplace=True)

In [72]:
df_yield=pd.concat([df1_yield,df2_yield])

In [73]:
df_yield.columns=np.array(['wind_code','trade_dt','s_mq_pctchange','s_mq_amount'])

In [74]:
df_yield.reset_index(inplace=True)

In [75]:
df_yield.drop(columns=['index'],inplace=True)

In [76]:
df['trade_dt']=pd.to_datetime((df['trade_dt']).apply(str))
df_yield['trade_dt']=pd.to_datetime((df_yield['trade_dt']).apply(str))

In [77]:
df.head()

Unnamed: 0,trade_dt,wind_code,ep_ttm,bp_lyr,sp_ttm,cfp_ncf_ttm,cfp_ocf_ttm,cfp_fcff_ttm,ortoev_ttm,peg,...,beta_60D_zz500,beta_120D_zz500,beta_240D_zz500,beta_480D_zz500,beta_20D_szzz,beta_60D_szzz,beta_120D_szzz,beta_240D_szzz,beta_480D_szzz,updatetime
0,2015-01-05,000001.SZ,0.105061,0.692445,0.379698,0.078568,-0.077901,,0.349975,32.393068,...,0.533999,0.561988,0.587762,0.916,1.528496,1.531845,1.514987,1.462726,1.724338,2019-05-23 01:46:00.000
1,2015-01-05,000002.SZ,0.093869,0.478156,0.822869,0.072315,0.189907,0.038477,0.780464,81.357305,...,0.738721,0.651407,0.568098,0.728017,1.40601,1.385631,1.338188,1.243581,1.312524,2019-05-23 01:46:00.000
2,2015-01-05,000003.SZ,,,,,,,,,...,,,,,,,,,,2019-05-23 01:46:00.000
3,2015-01-05,000004.SZ,-0.005683,0.053775,0.051532,-0.011803,0.001882,-0.008055,0.052032,53.969832,...,1.543587,1.117074,0.997353,0.904946,0.507822,0.457289,0.332883,0.522529,0.640062,2019-05-23 01:46:00.000
4,2015-01-05,000005.SZ,-0.012082,0.158174,0.014907,0.000598,-0.030551,0.086292,0.015112,10.260072,...,1.006348,0.807401,0.783661,0.822129,0.265381,0.428369,0.403558,0.67219,0.881936,2019-05-23 01:46:00.000


In [78]:
df_yield.head()

Unnamed: 0,wind_code,trade_dt,s_mq_pctchange,s_mq_amount
0,000725.SZ,2015-01-30,-7.4475,2014854.0
1,600303.SH,2015-01-30,-6.7475,149342.1
2,600306.SH,2015-01-30,8.1065,33036.35
3,600339.SH,2015-01-30,4.6778,108448.6
4,600346.SH,2015-01-30,17.9652,91792.13


# 特征与标签提取

### 1.保留每月最后一个交易日的因子数据

In [79]:
last_day_in_month=df.groupby([df['trade_dt'].apply(lambda x: x.year), df['trade_dt'].apply(lambda x: x.month)]).last()['trade_dt']

In [80]:
new_df = df[df.trade_dt.isin(last_day_in_month)]

### 2.打标签

In [81]:
month_pc=df_yield.groupby('trade_dt')['s_mq_pctchange']

In [82]:
quantile_0 = month_pc.quantile(q=0.3)

In [83]:
quantile_1 = month_pc.quantile(q=0.7)

In [84]:
df_yield['label'] = df_yield.apply(lambda x: 2 if x['s_mq_pctchange'] <= quantile_0[x['trade_dt']]
                                             else (1 if x['s_mq_pctchange'] <= quantile_1[x['trade_dt']]
                                                  else 0),axis=1
                                            )

In [85]:
time_dict=dict(zip(quantile_0.index[1:],quantile_0.index))

In [86]:
time_dict[quantile_0.index[0]]=None

In [87]:
df_yield['label_time'] = df_yield['trade_dt'].apply(lambda x: time_dict[x])
df_yield = df_yield[~df_yield['label_time'].isnull()]

In [88]:
df_with_label = pd.merge(new_df, df_yield, how='inner', left_on=['wind_code', 'trade_dt'],
                        right_on=['wind_code', 'label_time'])

In [89]:
df_with_label.head()

Unnamed: 0,trade_dt_x,wind_code,ep_ttm,bp_lyr,sp_ttm,cfp_ncf_ttm,cfp_ocf_ttm,cfp_fcff_ttm,ortoev_ttm,peg,...,beta_60D_szzz,beta_120D_szzz,beta_240D_szzz,beta_480D_szzz,updatetime,trade_dt_y,s_mq_pctchange,s_mq_amount,label,label_time
0,2015-01-30,000001.SZ,0.120824,0.796337,0.436667,0.090356,-0.089589,,0.397811,28.167006,...,1.357957,1.313219,1.339941,1.574382,2019-05-23 01:48:00.000,2015-02-27,0.4304,1804355.0,2,2015-01-30
1,2015-01-30,000002.SZ,0.106428,0.542133,0.932968,0.081991,0.215316,0.043625,0.878831,71.756337,...,1.330106,1.297934,1.226921,1.308739,2019-05-23 01:48:00.000,2015-02-27,-2.8202,3287976.0,2,2015-01-30
2,2015-01-30,000004.SZ,-0.00514,0.04863,0.046602,-0.010674,0.001702,-0.007284,0.04701,59.679834,...,0.367,0.331869,0.453777,0.558445,2019-05-23 01:48:00.000,2015-02-27,7.8996,41363.55,1,2015-01-30
3,2015-01-30,000005.SZ,-0.012082,0.158174,0.014907,0.000598,-0.030551,0.086292,0.015112,10.260072,...,0.311863,0.295469,0.554438,0.811105,2019-05-23 01:48:00.000,2015-02-27,0.0,0.0,2,2015-01-30
4,2015-01-30,000006.SZ,0.065695,0.446514,0.415537,-0.023423,-0.107033,0.079987,0.376103,-163.776036,...,0.743068,0.790482,0.879827,1.056388,2019-05-23 01:48:00.000,2015-02-27,1.0795,142236.9,2,2015-01-30


### 3.删除特征中非因子项

In [90]:
raw_data=df_with_label.drop(columns=['trade_dt_x','wind_code','updatetime','trade_dt_y','s_mq_pctchange','s_mq_amount'])
#保留label_time作为训练集和测试集的划分依据

### 4. 删除因子之间相关性过高的项

In [91]:
correlation=raw_data.corr()

In [92]:
corr_map=correlation.where((correlation>0.8) | (correlation < -0.8))

In [93]:
corr_li=[]

In [94]:
for i in correlation.index:
    for j in correlation.columns:
        if ~np.isnan(corr_map[i][j]) and corr_map[i][j]!=1 and (j,i) not in corr_li:
            corr_li.append((i,j))

In [95]:
degree_dict=dict()
for i in corr_li:
    for a in i:
        degree_dict[a]=degree_dict.get(a,0)+1

In [96]:
record_factor=[]

In [97]:
def find_max_key(degree_dict):
    return list(degree_dict.keys())[list(degree_dict.values()).index((max(degree_dict.values())))]

In [98]:
sum(degree_dict.values())

246

In [99]:
def delete_factor():
    while sum(degree_dict.values())!=0:
        max_key=find_max_key(degree_dict)
        degree_dict.pop(max_key)
        record_factor.append(max_key)
        temp_li=[]
        for index,i in enumerate(corr_li):
            if max_key in i:
                temp_li.append(i)
                for a in i:
                    if a!=max_key:
                        degree_dict[a]=degree_dict.get(a)-1
        for i in temp_li:
            corr_li.remove(i)

In [100]:
delete_factor()

In [101]:
record_factor

['ivol_capm_240D_szzz',
 'beta_120D_zz500',
 'finaexpensetogr_ttm2',
 'vol_60D',
 'beta_240D_zz500',
 'threecosttosales_ttm',
 'ivol_capm_60D_szzz',
 'ivol_capm_240D_zz500',
 'beta_60D_szzz',
 'beta_120D_szzz',
 'debttoequity',
 'operateprofitmargin_ttm',
 'mom_tw_60D',
 'ivol_capm_20D_zz500',
 'vol_120D',
 'beta_480D_zz500',
 'roa_avg_ttm',
 'netprofitmargin_ttm',
 'operateexpensetogr_ttm2',
 'net_profit_yoy_ttm',
 'net_profit_qoq_ttm',
 'net_profit_3y_ttm',
 'mom_180D',
 'mom_tw_120D',
 'turnover_20D',
 'ivol_capm_60D_zz500',
 'beta_20D_zz500',
 'roe_avg',
 'roe_diluted',
 'roa_diluted_ttm',
 'grossprofitmargin_qoq_ttm',
 'nonoperateprofittoebt_ttm',
 'eps_ttm',
 'current',
 'oper_profit_5y_ttm',
 'net_profit_yoy',
 'net_profit_qoq',
 'roe_yoy_ttm',
 'roe_qoq_ttm',
 'roe_3y_ttm',
 'grossprofitmargin_yoy',
 'roe_5y_ttm',
 'mom_240D',
 'turnover_60D',
 'turnbias_20Dto60D',
 'vol_20D',
 'ivol_capm_120D_szzz',
 'volbias_20Dto60D',
 'ivol_capm_480D_zz500',
 'dif',
 's_val_mv',
 'ln_mv',
 

In [102]:
raw_data.drop(columns=record_factor,inplace=True)

In [103]:
raw_data.shape

(185390, 123)

### 5.训练集与测试集的划分

In [104]:
import datetime

In [105]:
training_data=raw_data[raw_data['label_time']<datetime.datetime(2019,1,1)]

In [106]:
training_data.drop(columns=['label_time'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [107]:
test_data=raw_data[raw_data['label_time']>=datetime.datetime(2019,1,1)]

In [108]:
test_data.drop(columns=['label_time'],inplace=True)

In [109]:
training_data_x=training_data.loc[:,training_data.columns!='label']

In [110]:
training_data_y=training_data['label']

In [111]:
test_data_x=test_data.loc[:,test_data.columns!='label']

In [112]:
test_data_y=test_data['label']

####   a. 中位数去极值

In [113]:
record_median=dict()

In [114]:
def median_recompute(x):
    D_m=x.median()
#     print(D_m)
    new_series=abs(x-D_m)
    D_m1=new_series.median()
    x = x.apply(lambda y: D_m+5*D_m1 if y>D_m+5*D_m1 else (D_m-5*D_m1 if y< D_m-5*D_m1 else y) )
    record_median[x.name]=[D_m+5*D_m1, D_m-5*D_m1]

In [115]:
training_data_x.apply(median_recompute,axis=0)

ep_ttm                   None
bp_lyr                   None
sp_ttm                   None
cfp_ncf_ttm              None
cfp_ocf_ttm              None
cfp_fcff_ttm             None
ortoev_ttm               None
peg                      None
roe_avg_ttm              None
roe_diluted_ttm          None
roa_avg                  None
roa_diluted              None
roic_wind                None
roic_wind_ttm            None
berryratio_ttm           None
grossmargin_ttm          None
adminexpensetogr_ttm2    None
taxtoebt_ttm             None
taxtoor_ttm              None
eps_diluted_is           None
bps_ttm                  None
orps_ttm                 None
dps                      None
endogenousgrowth         None
scftosales_ttm           None
wgsdtosales_ttm          None
wgsdtoor_ttm             None
debttoassets             None
curassetsratio           None
fixedassettoasset        None
                         ... 
mom_480D                 None
mom_1200D                None
mom_tw_20D

In [116]:
def test_median_recompute(x):
    x=x.apply(lambda y: record_median[x.name][0] if y>record_median[x.name][0] else (record_median[x.name][1] if y<record_median[x.name][1] else y))

In [117]:
test_data_x.apply(test_median_recompute,axis=0)

ep_ttm                   None
bp_lyr                   None
sp_ttm                   None
cfp_ncf_ttm              None
cfp_ocf_ttm              None
cfp_fcff_ttm             None
ortoev_ttm               None
peg                      None
roe_avg_ttm              None
roe_diluted_ttm          None
roa_avg                  None
roa_diluted              None
roic_wind                None
roic_wind_ttm            None
berryratio_ttm           None
grossmargin_ttm          None
adminexpensetogr_ttm2    None
taxtoebt_ttm             None
taxtoor_ttm              None
eps_diluted_is           None
bps_ttm                  None
orps_ttm                 None
dps                      None
endogenousgrowth         None
scftosales_ttm           None
wgsdtosales_ttm          None
wgsdtoor_ttm             None
debttoassets             None
curassetsratio           None
fixedassettoasset        None
                         ... 
mom_480D                 None
mom_1200D                None
mom_tw_20D

###   b. 缺失值处理

In [118]:
imputation_transformer = SimpleImputer(np.nan, "mean")
imputation_transformer.fit(training_data_x)
X_train_imputed=imputation_transformer.transform(training_data_x)
X_test_imputed=imputation_transformer.transform(test_data_x)

### c.标准化

In [119]:
scaler = StandardScaler()
scaler.fit(X_train_imputed)
X_train = scaler.transform(X_train_imputed)
X_test = scaler.transform(X_test_imputed)

In [120]:
y_train=training_data_y.values
y_test=test_data_y.values

In [121]:
np.savetxt('data/X_train.csv',X_train,delimiter=',')

In [122]:
np.savetxt('data/X_test.csv',X_test,delimiter=',')

In [123]:
np.savetxt('data/y_train.csv',y_train,delimiter=',')

In [124]:
np.savetxt('data/y_test.csv',y_test,delimiter=',')