In [1]:
# Load in our libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib as mpl
%matplotlib inline
mpl.style.use('seaborn')

import warnings
warnings.filterwarnings('ignore')

import time
import datetime as dt

from sklearn import preprocessing
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier,\
                              GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

import math
import statistics



In [2]:
from data_processing import *

In [3]:
data = pd.read_csv("tsharep.csv",encoding=' big5-hkscs ').rename(columns={'代碼':'code','日期':'date','中文簡稱':'name','開盤價(元)':'open','最高價(元)':'high','最低價(元)':'low','收盤價(元)':'close','成交張數(張)':'volume'})
Taiwan50 = extract_target_index('ETF50.xlsx',data)
TXF = load_txt_TXF('TXF1-300-分鐘.txt',Taiwan50)
EXF = load_txt_2('EXF1-1-日.txt',Taiwan50)
FXF = load_txt_2('FXF1-1-日.txt',Taiwan50)
E_F = EXF/FXF
Nikkei = load_csv('Nikkei225.csv',Taiwan50)
VIX = load_csv('VIX.csv',Taiwan50).drop(['volume'],axis = 1)
Russell = load_csv('Russell2000.csv',Taiwan50)
SP = load_csv('S&P500.csv',Taiwan50)
DJ = load_csv('Dow Jones.csv',Taiwan50)
SOX = load_csv('SOX.csv',Taiwan50).drop(['volume'],axis = 1)
pcr = load_txt('put_call_ratio-日-成交價.txt',Taiwan50).drop(['volume','open','low'],axis = 1)
FOI = load_txt('外資未平倉-成交價.txt',Taiwan50).drop(['volume'],axis = 1)
IOI = load_txt('投信未平倉-日-成交價.txt',Taiwan50).drop(['volume'],axis = 1)
DOI = load_txt('自營未平倉-日-成交價.txt',Taiwan50).drop(['volume'],axis = 1)
orderbook_ = pd.read_hdf('twse_orderbook_comp.h5').rename(columns = {'時間':'date',
                                                                    '累積委託買進筆數':'ask_vol',
                                                                    '累積委託買進數量':'ask_count',
                                                                    '累積委託賣出筆數':'bid_vol',
                                                                    '累積委託賣出數量':'bid_count',
                                                                    '累積成交筆數':'vol',
                                                                    '累積成交數量':'count',
                                                                    '累積成交金額':'amount'})
orderbook = orderbook_[orderbook_.index==0].iloc[-1*Taiwan50.shape[0]-1:-1]
orderbook['date'] = Taiwan50.index.values
orderbook = orderbook.groupby('date').sum().drop(['vol','count','amount'],axis = 1)
ETFtable = ETF_data_processing('tetfp.csv')
Yunta50 = extract_target_ETF(ETFtable,50)
Yunta50_rate = get_change_rate(Yunta50)
oil_price = pd.read_csv('oil_price.csv')
oil_price['date'] = oil_price['date'].map(lambda x:dt.datetime.strptime(str(x),'%Y/%m/%d'))
oil_price = oil_price.set_index('date')

fiveyear_bond = pd.read_excel('five_year_bond.xls').rename(columns = {'observation_date':'date'}).set_index('date')
tenyear_bond = pd.read_excel('ten_year_bond.xls').rename(columns = {'observation_date':'date'}).set_index('date')
bond = pd.concat([fiveyear_bond,tenyear_bond],axis = 1).loc[Taiwan50.index[0]:Taiwan50.index[-1]].rename(columns = {'DGS5':'DGSfive'
                                                                                                                    ,'DGS10':'DGSten'})
mrf = pd.Series(get_mean_rf(Yunta50,'close',20,5),index=Yunta50.index)

In [4]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]

In [5]:
models  =  {
    'DecisionTreeClassifier' : DecisionTreeClassifier(random_state = 0),
         'RandomForestClassifier': RandomForestClassifier(random_state = 0),
#     'ExtraTreesClassifier': ExtraTreesClassifier(random_state = 0),
#     'AdaBoostClassifier': AdaBoostClassifier(base_estimator = DecisionTreeClassifier(),\
#                                              n_estimators = 10,random_state = 0),
#     'GradientBoostingClassifier': GradientBoostingClassifier(random_state = 0),
#     'SVC': SVC(probability=True,random_state = 0),
}
    
model_grid_params = {
        'DecisionTreeClassifier':{'max_depth': [None,1,5,10],'min_samples_leaf': [1,2,5,10]},
    'RandomForestClassifier': {'max_features':[None],'n_estimators':[5,10],'max_depth':[2,10],\
                               'min_samples_split':[2],'criterion':['entropy'],\
                               'min_samples_leaf':[3]},
#     'ExtraTreesClassifier': {'max_features':[None],'n_estimators':[10],'max_depth':[10],\
#                              'min_samples_split':[2],'criterion':['entropy'],\
#                              'min_samples_leaf':[3]},
#     'AdaBoostClassifier': {"base_estimator__criterion" : ["entropy"],\
#                            "base_estimator__max_depth": [None],\
#                            "base_estimator__min_samples_leaf" : [3],\
#                            "base_estimator__min_samples_split" : [2],\
#                            "base_estimator__max_features" : [None]},
#     'GradientBoostingClassifier': {'max_features':[None],'n_estimators':[10],'max_depth':[10],\
#                                    'min_samples_split':[2],'min_samples_leaf':[3],\
#                                    'learning_rate':[0.1],'subsample':[1.0]},
#     'SVC': [{'kernel':['rbf'],'gamma':[1e-1],'C':[1]},\
#             {'kernel':['linear'],'C':[1,10]}]
}

In [6]:
class Model_Selection:
    
    def __init__(self,models,model_grid_params,stock_5,latest_day,pred_day,day):
        
        self.models = models
        self.model_grid = model_grid_params
        self.stock_5 = stock_5
        self.latest_day= latest_day
        self.pred_day = pred_day
        self.day = day
        self.keys = models.keys()
        self.best_score = {}
        self.grid = {}
        
        self.predict_values = {}
        self.cv_acc = {}
        self.acc = {}
        self.fscore = {}
        self.true_values = {}
        
        self.predict_values_day = {}
        self.cv_acc_day = {}
        self.acc_day = {}
        self.fscore_day = {}
        self.true_values_day = {}
        self.summary_day = []
        
    def Grid_fit(self,X_train,y_train,cv = 5,scoring = 'accuracy'):
        
        for key in self.keys:
            print ("Running GridSearchCV for %s" %(key))
            model = self.models[key]
            model_grid = self.model_grid[key]
            Grid = GridSearchCV(model, model_grid, cv = cv, scoring = scoring)
            Grid.fit(X_train,y_train) 
            self.grid[key] = Grid
            print (Grid.best_params_)
            print ('CV Best Score = %s'%(Grid.best_score_))
            self.cv_acc[key].append(Grid.best_score_)  
    
    def model_fit(self,X_train, y_train, X_test, y_test):
        
        for key in self.keys:
            print ("Running training & testing for %s." %(key))
            model = self.models[key]
            model.set_params(**self.grid[key].best_params_)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            #print 'Prediction latest 15 second = %s'%(predictions)
            self.predict_values[key].append(predictions.tolist())
            self.true_values[key].append(y_test.tolist())
            acc = metrics.accuracy_score(y_test,predictions)
            f_score = metrics.f1_score(y_test,predictions)
            print ('Accuracy = %s'%(acc))
            self.acc[key].append(acc)
            self.fscore[key].append(f_score)


    def pipline(self):
        
        self.set_list_day() # store day values
        for day in range(0,self.day,1):
            self.set_list() # store values
            print ('Day = %s'%(day+1))
            for i in range(0,500,self.pred_day):#9000-self.latest_day-600,self.pred_day):
                
                print ('--------------------Rolling Window Time = %s--------------------'%(i/pred_day))
                # Train data
                data_train = self.stock_5[day][i:i+self.latest_day]
                X_train = data_train[:,:-3]
                train_rise = data_train[:,-3]
                train_fall = data_train[:,-2]
                train_noise = data_train[:,-1]
                y_train = train_rise

                # Test data
                data_test = self.stock_5[day][i + self.latest_day:i + self.latest_day + self.pred_day]
                X_test = data_test[:,:-3]
                test_rise = data_test[:,-3]
                test_fall = data_test[:,-2]
                test_noise = data_test[:,-1]
                y_test = test_rise
                
                #start = time.time()
                self.Grid_fit(X_train, y_train, cv = 2, scoring = 'accuracy')
                self.model_fit(X_train, y_train,X_test,y_test)
                #end = time.time()
                #print 'Total Time = %s'%(end - start)
                
            for key in self.keys:
                
                self.cv_acc_day[key].append(self.cv_acc[key])
                self.acc_day[key].append(self.acc[key])
                self.fscore_day[key].append(self.fscore[key])
                self.true_values_day[key].append(self.true_values[key])
                self.predict_values_day[key].append(self.predict_values[key])
            
            self.summary_day.append(self.score_summary(sort_by = 'Accuracy_mean'))
    
    def set_list(self):
        
        for key in self.keys:
            self.predict_values[key] = []
            self.cv_acc[key] = []
            self.acc[key] = []
            self.fscore[key] = []
            self.true_values[key] = []
            
    def set_list_day(self):
        
        for key in self.keys:
            self.predict_values_day[key] = []
            self.cv_acc_day[key] = []
            self.acc_day[key] = []
            self.fscore_day[key] = []
            self.true_values_day[key] = []
            
    def score_summary(self,sort_by):
        
        summary = pd.concat([pd.Series(list(pip.acc.keys())),pd.Series(map(lambda x: sum(pip.acc[x])/len(pip.acc[x]), pip.acc)),\
                             pd.Series(list(map(lambda x: statistics.stdev(pip.acc[x]), pip.acc))),\
                             pd.Series(list(map(lambda x: max(pip.acc[x]), pip.acc))),\
                             pd.Series(list(map(lambda x: min(pip.acc[x]), pip.acc))),\
                             pd.Series(list(map(lambda x: sum(pip.fscore[x])/len(pip.fscore[x]), pip.fscore)))],axis=1)
        summary.columns = ['Estimator','Accuracy_mean','Accuracy_std','Accuracy_max','Accuracy_min','F_score']
        summary.index.rename('Ranking', inplace=True)
        return summary.sort_values(by = [sort_by], ascending=False)
          
    def print_(self):

        print (self.predict_values)

In [7]:
def get_dataframe(df_list,df_r,target,interval,window,ca=4,day = 1):
    df_list_rate = []
    df_rf = get_rise_fall(target,interval).shift(-1*day+1)
    
    #concat the df in the df_list
    for index,df in enumerate(df_list):
        if(df_r[index]):
            df_rate = get_change_rate(df,window).iloc[:,df.shape[1]:]
        else:
            df_rate = get_backday(df,window).iloc[:,df.shape[1]:]
            
        df_av  = get_av_ca(stock=df_rate,rf_df=df_rf,ca=ca,window = window).iloc[:,:-3]
        df_colname = list(map(lambda x:  df_name[index]+'_'+x,df_av.columns))
        df_dum = pd.get_dummies(df_av,prefix=df_colname,columns=df_av.columns)
        df_list_rate.append(df_dum)
        
    df_all = pd.concat(df_list_rate,axis =1)
    df_all['mean_rise_fall'] = mrf
    
    #create null columns for the null data
    df_null = df_all.copy()
    for columnname in df_all.columns:
        df_null[columnname+'isnull'] = list(map(lambda x: int(x),df_all.isna()[columnname]))
    df_null.drop(df_all.columns,axis = 1,inplace = True)

    df_total = pd.concat([df_all,df_null],axis = 1)
    df_total = pd.concat([df_total,df_rf.loc[df_total.index]],axis = 1).loc[TXF.index[25]:TXF.index[-1],:].fillna(0)
    
    #get target rise fall data
    Y = Yunta50.copy()
    Y['trf'] = np.sign((Yunta50-Yunta50.shift(1))['close'].values)
    Y = Y.shift(-1*day+1)
    trf = Y.loc[df_total.index]['trf']

    return [df_total ,trf]


In [8]:
def get_dataframe_feature(df_list,df_r,target,interval,window,ca=4,day = 1):
    df_list_rate = []
    df_rf = get_rise_fall(target,interval).shift(-1*day+1)
    
    #concat the df in the df_list
    for index,df in enumerate(df_list):
        if(df_r[index]):
            df_rate = get_change_rate(df,window).iloc[:,df.shape[1]:]
        else:
            df_rate = get_backday(df,window).iloc[:,df.shape[1]:]
            
        df_av  = get_av_ca(stock=df_rate,rf_df=df_rf,ca=ca,window = window).iloc[:,:-3]
        df_colname = list(map(lambda x:  df_name[index]+'_'+x,df_av.columns))
        df_dum = pd.get_dummies(df_av,prefix=df_colname,columns=df_av.columns)
        df_list_rate.append(df_dum)
        
    df_all = pd.concat(df_list_rate,axis =1)
    df_all['mean_rise_fall'] = mrf
    
    #create null columns for the null data
    df_null = df_all.copy()
    for columnname in df_all.columns:
        df_null[columnname+'isnull'] = list(map(lambda x: int(x),df_all.isna()[columnname]))
    df_null.drop(df_all.columns,axis = 1,inplace = True)

    df_total = pd.concat([df_all,df_null],axis = 1)
    df_total = pd.concat([df_total,df_rf.loc[df_total.index]],axis = 1).loc[TXF.index[25]:TXF.index[-1],:].fillna(0)
    
    #get target rise fall data
    Y = Yunta50.copy()
    Y['trf'] = np.sign((Yunta50-Yunta50.shift(1))['close'].values)
    Y = Y.shift(-1*day+1)
    trf = Y.loc[df_total.index]['trf']

    data = df_total.as_matrix()
    result = int(data.shape[0]*0.9)
    train_X = data[:result,:-3]
    train_rise = data[:result,-3]
    train_fall = data[:result,-2]
    train_noise = data[:result,-1]

    test_X = data[result:,:-3]
    test_rise = data[result:,-3]
    test_fall = data[result:,-2]
    test_noise = data[result:,-1]
    trf_v = trf.values[result:]

    return [df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf_v]


In [9]:
def get_dataframe_rf(df_list,df_r,target,interval,window,ca=4,day = 1):
    df_list_rate = []
    df_rf = get_rise_fall(target,interval).shift(-1*day+1)
    
    #concat the df in the df_list
    for index,df in enumerate(df_list):
        if(df_r[index]):
            df_rate = get_change_rate(df,window).iloc[:,df.shape[1]:]
        else:
            df_rate = get_backday(df,window).iloc[:,df.shape[1]:]
            
        df_av  = get_av_ca(stock=df_rate,rf_df=df_rf,ca=ca,window = window).iloc[:,:-3]
        df_colname = list(map(lambda x:  df_name[index]+'_'+x,df_av.columns))
        df_dum = pd.get_dummies(df_av,prefix=df_colname,columns=df_av.columns)
        df_list_rate.append(df_dum)
        
    df_all = pd.concat(df_list_rate,axis =1)
    df_all['mean_rise_fall'] = mrf
    
    df_total = df_all.dropna(axis = 0,how = 'any')
    
    #split rise and fall
    dicu = {}
    dicd = {}
    for index, col in enumerate(df_total.columns[:-1]):
        dicu[col] = col+'_r'
        dicd[col] = col+'_f'

    up = df_total[df_total['mean_rise_fall']==1].rename(columns = dicu).iloc[:,:-1]
    dn = df_total[df_total['mean_rise_fall']==-1].rename(columns = dicd).iloc[:,:-1]
    df_total_ud = pd.concat([up,dn],axis = 1).sort_index().fillna(0)
    
    df_total_ud = pd.concat([df_total_ud,df_rf.loc[df_total.index]],axis = 1).dropna(axis = 0,how = 'any')
    
    #get target rise fall data
    Y = Yunta50.copy()
    Y['trf'] = np.sign((Yunta50-Yunta50.shift(1))['close'].values)
    Y = Y.shift(-1*day+1)
    trf = Y.loc[df_total.index]['trf']
    

    return [df_total_ud,trf_v]

In [10]:
def get_dataframe_rf_feature(df_list,df_r,target,interval,window,ca=4,day = 1):
    df_list_rate = []
    df_rf = get_rise_fall(target,interval).shift(-1*day+1)
    
    #concat the df in the df_list
    for index,df in enumerate(df_list):
        if(df_r[index]):
            df_rate = get_change_rate(df,window).iloc[:,df.shape[1]:]
        else:
            df_rate = get_backday(df,window).iloc[:,df.shape[1]:]
            
        df_av  = get_av_ca(stock=df_rate,rf_df=df_rf,ca=ca,window = window).iloc[:,:-3]
        df_colname = list(map(lambda x:  df_name[index]+'_'+x,df_av.columns))
        df_dum = pd.get_dummies(df_av,prefix=df_colname,columns=df_av.columns)
        df_list_rate.append(df_dum)
        
    df_all = pd.concat(df_list_rate,axis =1)
    df_all['mean_rise_fall'] = mrf
    
    df_total = df_all.dropna(axis = 0,how = 'any')
    
    #split rise and fall
    dicu = {}
    dicd = {}
    for index, col in enumerate(df_total.columns[:-1]):
        dicu[col] = col+'_r'
        dicd[col] = col+'_f'

    up = df_total[df_total['mean_rise_fall']==1].rename(columns = dicu).iloc[:,:-1]
    dn = df_total[df_total['mean_rise_fall']==-1].rename(columns = dicd).iloc[:,:-1]
    df_total_ud = pd.concat([up,dn],axis = 1).sort_index().fillna(0)
    
    df_total_ud = pd.concat([df_total_ud,df_rf.loc[df_total.index]],axis = 1).dropna(axis = 0,how = 'any')
    
    #get target rise fall data
    Y = Yunta50.copy()
    Y['trf'] = np.sign((Yunta50-Yunta50.shift(1))['close'].values)
    Y = Y.shift(-1*day+1)
    trf = Y.loc[df_total_ud.index]['trf']
    
    data = df_total_ud.as_matrix()
    result = int(data.shape[0]*0.9)
    train_X = data[:result,:-3]
    train_rise = data[:result,-3]
    train_fall = data[:result,-2]
    train_noise = data[:result,-1]

    test_X = data[result:,:-3]
    test_rise = data[result:,-3]
    test_fall = data[result:,-2]
    test_noise = data[result:,-1]
    trf_v = trf.values[result:]

    return [df_total_ud,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf_v]

In [11]:
def show_summary(summary):
    dic = {}
    df = pd.concat(pip.summary_day,axis = 0,ignore_index = True)
    for index,i in enumerate(df.index):
        dic[i] = intervals[index//2]
    df_rn = df.rename(index = dic)
    df_rn.index.name = 'interval'
    return df_rn


In [12]:
rf_params_test = {
#     'n_jobs': -1,
    'n_estimators': [1],
#      'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': [None,1,5],
    'min_samples_leaf': [1,2,5],
#     'max_features' : 'sqrt',
#     'verbose': 0
}

In [13]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]
df_name = ['Yunta50','Taiwan50','orderbook','oil_price','bond','TXF','EXF','FXF','E_F','Nikkei','SP','Russell','DJ','SOX','VIX','pcr','FOI','IOI','DOI']


# Observation of the rf data 
### single r or f

In [209]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]
df_name = ['Yunta50','Taiwan50','orderbook','oil_price','bond','TXF','EXF','FXF','E_F','Nikkei','SP','Russell','DJ','SOX','VIX','pcr','FOI','IOI','DOI']

intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
window = 1

best_score = []
best_params = []
for interval in intervals:
    print('interval is',interval)
    df_total ,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf_v = get_dataframe_rf_feature(df_list,df_option,Yunta50_rate,interval,window,ca = 4)
    amount_of_features = len(df_total.columns)-3
    score = []
    params = []
    for index in range(amount_of_features):
        print('feature',index)
        train_X_f = train_X[:,index].reshape((-1,1))
        test_X_f = test_X[:,index].reshape((-1,1))
        param_test1 = rf_params_test
        gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
        gsearch1.fit(train_X_f,train_rise)
        params.append(gsearch1.best_params_)
        score.append(gsearch1.best_score_)

#         model = RandomForestClassifier()
#         model.set_params(**gsearch1.best_params_)

    best_score.append(score)
    best_params.append(params)


interval is 0
feature 0
feature 1
feature 2
feature 3
feature 4
feature 5
feature 6
feature 7
feature 8
feature 9
feature 10
feature 11
feature 12
feature 13
feature 14
feature 15
feature 16
feature 17
feature 18
feature 19
feature 20
feature 21
feature 22
feature 23
feature 24
feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
featur

feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
feature 90
feature 91
feature 92
feature 93
feature 94
feature 95
feature 96
feature 97
feature 98
feature 99
feature 100
feature 101
feature 102
feature 103
feature 104
feature 105
feature 106
feature 107
feature 108
feature 109
feature 110
feature 111
feature 112
feature 113
feature 114
feature 115
feature 116
feature 117
feature 118
feature 119
feature 120
feature 121
feature 122
feature 123
feature 124
feature 125
feature 126
feature 127
feature 128
feature 129
feature 130
feature 131
feature 132
feature 133
feature 134
feature 135
feature 136
feature 137
feature 138
feature 139
feature 140
feature 141
feature 142
feature 143
feature 144
feature 145
feature 146
feature 147
feature 148
feature 149
feature 150
fea

feature 128
feature 129
feature 130
feature 131
feature 132
feature 133
feature 134
feature 135
feature 136
feature 137
feature 138
feature 139
feature 140
feature 141
feature 142
feature 143
feature 144
feature 145
feature 146
feature 147
feature 148
feature 149
feature 150
feature 151
feature 152
feature 153
feature 154
feature 155
feature 156
feature 157
feature 158
feature 159
feature 160
feature 161
feature 162
feature 163
feature 164
feature 165
feature 166
feature 167
feature 168
feature 169
feature 170
feature 171
feature 172
feature 173
feature 174
feature 175
feature 176
feature 177
feature 178
feature 179
feature 180
feature 181
feature 182
feature 183
feature 184
feature 185
feature 186
feature 187
feature 188
feature 189
feature 190
feature 191
feature 192
feature 193
feature 194
feature 195
feature 196
feature 197
feature 198
feature 199
feature 200
feature 201
feature 202
feature 203
feature 204
feature 205
feature 206
feature 207
feature 208
feature 209
feature 210
feat

feature 187
feature 188
feature 189
feature 190
feature 191
feature 192
feature 193
feature 194
feature 195
feature 196
feature 197
feature 198
feature 199
feature 200
feature 201
feature 202
feature 203
feature 204
feature 205
feature 206
feature 207
feature 208
feature 209
feature 210
feature 211
feature 212
feature 213
feature 214
feature 215
feature 216
feature 217
feature 218
feature 219
feature 220
feature 221
feature 222
feature 223
feature 224
feature 225
feature 226
feature 227
feature 228
feature 229
feature 230
feature 231
feature 232
feature 233
feature 234
feature 235
feature 236
feature 237
feature 238
feature 239
feature 240
feature 241
feature 242
feature 243
feature 244
feature 245
feature 246
feature 247
feature 248
feature 249
feature 250
feature 251
feature 252
feature 253
feature 254
feature 255
feature 256
feature 257
feature 258
feature 259
feature 260
feature 261
feature 262
feature 263
feature 264
feature 265
feature 266
feature 267
feature 268
feature 269
feat

feature 246
feature 247
feature 248
feature 249
feature 250
feature 251
feature 252
feature 253
feature 254
feature 255
feature 256
feature 257
feature 258
feature 259
feature 260
feature 261
feature 262
feature 263
feature 264
feature 265
feature 266
feature 267
feature 268
feature 269
feature 270
feature 271
feature 272
feature 273
feature 274
feature 275
feature 276
feature 277
feature 278
feature 279
feature 280
feature 281
feature 282
feature 283
feature 284
feature 285
feature 286
feature 287
feature 288
feature 289
feature 290
feature 291
feature 292
feature 293
feature 294
feature 295
feature 296
feature 297
feature 298
feature 299
feature 300
feature 301
feature 302
feature 303
feature 304
feature 305
feature 306
feature 307
feature 308
feature 309
feature 310
feature 311
feature 312
feature 313
feature 314
feature 315
feature 316
feature 317
feature 318
feature 319
feature 320
feature 321
feature 322
feature 323
feature 324
feature 325
feature 326
feature 327
feature 328
feat

feature 306
feature 307
feature 308
feature 309
feature 310
feature 311
feature 312
feature 313
feature 314
feature 315
feature 316
feature 317
feature 318
feature 319
feature 320
feature 321
feature 322
feature 323
feature 324
feature 325
feature 326
feature 327
feature 328
feature 329
feature 330
feature 331
feature 332
feature 333
feature 334
feature 335
feature 336
feature 337
feature 338
feature 339
feature 340
feature 341
feature 342
feature 343
feature 344
feature 345
feature 346
feature 347
feature 348
feature 349
feature 350
feature 351
feature 352
feature 353
feature 354
feature 355
feature 356
feature 357
feature 358
feature 359
feature 360
feature 361
feature 362
feature 363
feature 364
feature 365
feature 366
feature 367
feature 368
feature 369
feature 370
feature 371
feature 372
feature 373
feature 374
feature 375
feature 376
feature 377
feature 378
feature 379
feature 380
feature 381
feature 382
feature 383
feature 384
feature 385
feature 386
feature 387
feature 388
feat

feature 365
feature 366
feature 367
feature 368
feature 369
feature 370
feature 371
feature 372
feature 373
feature 374
feature 375
feature 376
feature 377
feature 378
feature 379
feature 380
feature 381
feature 382
feature 383
feature 384
feature 385
feature 386
feature 387
feature 388
feature 389
feature 390
feature 391
feature 392
feature 393
feature 394
feature 395
feature 396
feature 397
feature 398
feature 399
feature 400
feature 401
feature 402
feature 403
feature 404
feature 405
feature 406
feature 407
feature 408
feature 409
feature 410
feature 411
feature 412
feature 413
feature 414
feature 415
feature 416
feature 417
feature 418
feature 419
feature 420
feature 421
feature 422
feature 423
feature 424
feature 425
feature 426
feature 427
feature 428
feature 429
feature 430
feature 431
feature 432
feature 433
feature 434
feature 435
feature 436
feature 437
feature 438
feature 439
feature 440
feature 441
feature 442
feature 443
feature 444
feature 445
feature 446
feature 447
feat

In [210]:
df_srf = pd.DataFrame(best_score,columns=df_total.columns[:-3],index = intervals)
df_srf

Unnamed: 0,Yunta50_open_1_1.0_r,Yunta50_open_1_2.0_r,Yunta50_open_1_3.0_r,Yunta50_open_1_4.0_r,Yunta50_high_1_1.0_r,Yunta50_high_1_2.0_r,Yunta50_high_1_3.0_r,Yunta50_high_1_4.0_r,Yunta50_low_1_1.0_r,Yunta50_low_1_2.0_r,...,DOI_high_1_3.0_f,DOI_high_1_4.0_f,DOI_low_1_1.0_f,DOI_low_1_2.0_f,DOI_low_1_3.0_f,DOI_low_1_4.0_f,DOI_close_1_1.0_f,DOI_close_1_2.0_f,DOI_close_1_3.0_f,DOI_close_1_4.0_f
0.0,0.505233,0.502379,0.500476,0.491912,0.511893,0.50333,0.505233,0.505233,0.500476,0.506185,...,0.50333,0.497621,0.498573,0.505233,0.50333,0.497621,0.498573,0.505233,0.50333,0.497621
0.1,0.543292,0.530923,0.543292,0.543292,0.543292,0.543292,0.543292,0.543292,0.532826,0.543292,...,0.534729,0.529971,0.53568,0.541389,0.534729,0.529971,0.53568,0.541389,0.534729,0.529971
0.25,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,...,0.607992,0.589914,0.607992,0.603235,0.607992,0.589914,0.607992,0.603235,0.607992,0.589914
0.5,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,...,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878
0.75,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,...,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336
1.0,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,...,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133
1.5,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,...,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814


In [211]:
df_srf.iloc[0,:][df_srf.iloc[0,:]>0.6]

Series([], Name: 0.0, dtype: float64)

## Combine r and f

In [212]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]
df_name = ['Yunta50','Taiwan50','orderbook','oil_price','bond','TXF','EXF','FXF','E_F','Nikkei','SP','Russell','DJ','SOX','VIX','pcr','FOI','IOI','DOI']

intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
window = 1

best_score = []
best_params = []
for interval in intervals:
    print('interval is',interval)
    df_total ,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf_v = get_dataframe_rf_feature(df_list,df_option,Yunta50_rate,interval,window,ca = 4)
    amount_of_features = len(df_total.columns)-3
    half = int(amount_of_features/2)
    score = []
    params = []
    for index in range(half):
        print('feature',index)
        train_X_f = train_X[:,[index,index+half]]
#         test_X_f = test_X[:,index].reshape((-1,1))
        param_test1 = rf_params_test
        gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
        gsearch1.fit(train_X_f,train_rise)
        params.append(gsearch1.best_params_)
        score.append(gsearch1.best_score_)

#         model = RandomForestClassifier()
#         model.set_params(**gsearch1.best_params_)

    best_score.append(score)
    best_params.append(params)


interval is 0
feature 0
feature 1
feature 2
feature 3
feature 4
feature 5
feature 6
feature 7
feature 8
feature 9
feature 10
feature 11
feature 12
feature 13
feature 14
feature 15
feature 16
feature 17
feature 18
feature 19
feature 20
feature 21
feature 22
feature 23
feature 24
feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
featur

feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
feature 90
feature 91
feature 92
feature 93
feature 94
feature 95
feature 96
feature 97
feature 98
feature 99
feature 100
feature 101
feature 102
feature 103
feature 104
feature 105
feature 106
feature 107
feature 108
feature 109
feature 110
feature 111
feature 112
feature 113
feature 114
feature 115
feature 116
feature 117
feature 118
feature 119
feature 120
feature 121
feature 122
feature 123
feature 124
feature 125
feature 126
feature 127
feature 128
feature 129
feature 130
feature 131
feature 132
feature 133
feature 134
feature 135
feature 136
feature 137
feature 138
feature 139
feature 140
feature 141
feature 142
feature 143
feature 144
feature 145
feature 146
feature 147
feature 148
feature 149
feature 150
feature 151
feature 152
feature 153
feature 154
feature 155
feature 156
feature 157
feature 158
featur

feature 144
feature 145
feature 146
feature 147
feature 148
feature 149
feature 150
feature 151
feature 152
feature 153
feature 154
feature 155
feature 156
feature 157
feature 158
feature 159
feature 160
feature 161
feature 162
feature 163
feature 164
feature 165
feature 166
feature 167
feature 168
feature 169
feature 170
feature 171
feature 172
feature 173
feature 174
feature 175
feature 176
feature 177
feature 178
feature 179
feature 180
feature 181
feature 182
feature 183
feature 184
feature 185
feature 186
feature 187
feature 188
feature 189
feature 190
feature 191
feature 192
feature 193
feature 194
feature 195
feature 196
feature 197
feature 198
feature 199
feature 200
feature 201
feature 202
feature 203
feature 204
feature 205
feature 206
feature 207
feature 208
feature 209
feature 210
feature 211
feature 212
feature 213
feature 214
feature 215
feature 216
feature 217
feature 218
feature 219
feature 220
feature 221
feature 222
feature 223
feature 224
feature 225
feature 226
feat

feature 211
feature 212
feature 213
feature 214
feature 215
feature 216
feature 217
feature 218
feature 219
feature 220
feature 221
feature 222
feature 223
feature 224
feature 225
feature 226
feature 227
feature 228
feature 229
feature 230
feature 231
feature 232
feature 233
feature 234
feature 235
feature 236
feature 237
feature 238
feature 239
feature 240
feature 241
feature 242
feature 243
feature 244
feature 245
feature 246
feature 247
feature 248
feature 249
feature 250
feature 251
feature 252
feature 253
feature 254
feature 255
feature 256
feature 257
feature 258
feature 259
feature 260
feature 261
feature 262
feature 263
feature 264
feature 265
feature 266
feature 267
feature 268
feature 269
feature 270
feature 271
feature 272
feature 273
feature 274
feature 275
feature 276
feature 277
feature 278
feature 279
feature 280
feature 281
feature 282
feature 283
feature 284
feature 285
feature 286
feature 287
feature 288
feature 289
feature 290
feature 291
feature 292
feature 293
feat

In [213]:
df_crf = pd.DataFrame(best_score,columns=df_total.columns[:half],index = intervals)
df_crf

Unnamed: 0,Yunta50_open_1_1.0_r,Yunta50_open_1_2.0_r,Yunta50_open_1_3.0_r,Yunta50_open_1_4.0_r,Yunta50_high_1_1.0_r,Yunta50_high_1_2.0_r,Yunta50_high_1_3.0_r,Yunta50_high_1_4.0_r,Yunta50_low_1_1.0_r,Yunta50_low_1_2.0_r,...,DOI_high_1_3.0_r,DOI_high_1_4.0_r,DOI_low_1_1.0_r,DOI_low_1_2.0_r,DOI_low_1_3.0_r,DOI_low_1_4.0_r,DOI_close_1_1.0_r,DOI_close_1_2.0_r,DOI_close_1_3.0_r,DOI_close_1_4.0_r
0.0,0.514748,0.502379,0.500476,0.491912,0.515699,0.50333,0.505233,0.506185,0.500476,0.509039,...,0.506185,0.50333,0.507136,0.50333,0.506185,0.50333,0.507136,0.50333,0.506185,0.50333
0.1,0.543292,0.530923,0.543292,0.543292,0.543292,0.543292,0.543292,0.543292,0.532826,0.543292,...,0.543292,0.531874,0.543292,0.543292,0.543292,0.531874,0.543292,0.543292,0.543292,0.531874
0.25,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,...,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992,0.607992
0.5,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,...,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878,0.727878
0.75,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,...,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336,0.797336
1.0,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,...,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133,0.860133
1.5,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,...,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814,0.944814


In [214]:
df_crf.iloc[0,:][df_crf.iloc[0,:]>0.6]

SP_close_1_4.0_r    0.615604
DJ_close_1_4.0_r    0.603235
Name: 0.0, dtype: float64

In [215]:
arg_i1 = np.argsort(df_crf.iloc[0,:].values)[-10:]
df_crf.columns[arg_i1]

Index(['SP_low_1_1.0_r', 'Russell_close_1_4.0_r', 'DJ_close_1_1.0_r',
       'SP_close_1_1.0_r', 'VIX_close_1_1.0_r', 'VIX_close_1_4.0_r',
       'SOX_close_1_4.0_r', 'SP_high_1_4.0_r', 'DJ_close_1_4.0_r',
       'SP_close_1_4.0_r'],
      dtype='object')

In [216]:
arg_i1

array([176, 203, 220, 180, 256, 259, 243, 175, 223, 183], dtype=int64)

#### combine rf is better

## Investigation of ca

### get all feature model  

In [217]:
df_list = [SP,Russell,DJ,SOX,VIX]
df_option = [True,True,True,True,True]
df_name = ['SP','Russell','DJ','SOX','VIX']
# df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
# df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]

intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
ca = 10
window = 1

best_score = []
best_params = []
for interval in intervals:
    print('interval is',interval)
    df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = ca)
    amount_of_features = len(df_total.columns)-3
    half = int(amount_of_features/2)
    score = []
    params = []
    for index in range(half):
        print('feature',index)
        train_X_f = train_X[:,index].reshape((-1,1))
        param_test1 = rf_params_test
        gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
        gsearch1.fit(train_X_f,train_rise)
        params.append(gsearch1.best_params_)
        score.append(gsearch1.best_score_)

#         model = RandomForestClassifier()
#         model.set_params(**gsearch1.best_params_)

    best_score.append(score)
    best_params.append(params)


interval is 0
feature 0
feature 1
feature 2
feature 3
feature 4
feature 5
feature 6
feature 7
feature 8
feature 9
feature 10
feature 11
feature 12
feature 13
feature 14
feature 15
feature 16
feature 17
feature 18
feature 19
feature 20
feature 21
feature 22
feature 23
feature 24
feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
featur

feature 14
feature 15
feature 16
feature 17
feature 18
feature 19
feature 20
feature 21
feature 22
feature 23
feature 24
feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
feature 90
feature 91
feature 92
feature 93
feature 94
feature 95
feature 96
feature 97
feature 98
feature 99
feature 100
feature 101
feature 102
feature 103
featur

feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
feature 90
feature 91
feature 92
feature 93
feature 94
feature 95
feature 96
feature 97
feature 98
feature 99
feature 100
feature 101
feature 102
feature 103
feature 104
feature 105
feature 106
feature 107
feature 108
feature 109
feature 110
feature 111
feature 112
feature 113
feature 114
feature 115
feature 116
feature 117
fea

In [218]:
df_ca_am = pd.DataFrame(best_score,columns=df_total.columns[:half],index = intervals)
df_ca_am

Unnamed: 0,SP_open_1_1.0,SP_open_1_2.0,SP_open_1_3.0,SP_open_1_4.0,SP_open_1_5.0,SP_open_1_6.0,SP_open_1_7.0,SP_open_1_8.0,SP_open_1_9.0,SP_open_1_10.0,...,VIX_close_1_2.0,VIX_close_1_3.0,VIX_close_1_4.0,VIX_close_1_5.0,VIX_close_1_6.0,VIX_close_1_7.0,VIX_close_1_8.0,VIX_close_1_9.0,VIX_close_1_10.0,mean_rise_fall
0.0,0.498657,0.502238,0.498657,0.496867,0.499552,0.506714,0.4906,0.521038,0.505819,0.499552,...,0.541629,0.52462,0.518353,0.478066,0.506714,0.514772,0.508505,0.529991,0.546106,0.481647
0.1,0.533572,0.540734,0.504924,0.522829,0.540734,0.540734,0.540734,0.547001,0.544315,0.546106,...,0.565801,0.552372,0.54521,0.47538,0.540734,0.540734,0.540734,0.540734,0.52462,0.540734
0.25,0.609669,0.609669,0.609669,0.609669,0.609669,0.609669,0.609669,0.605192,0.609669,0.609669,...,0.627574,0.605192,0.609669,0.609669,0.609669,0.609669,0.609669,0.609669,0.609669,0.609669
0.5,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,...,0.722471,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052
0.75,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,...,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358
1.0,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,...,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131
1.5,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,...,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285


In [14]:
arg = np.argsort(df_ca_am.loc[intervals[0]].values)[-10:]
df_ca_am.loc[intervals[0]][arg].mean()

NameError: name 'df_ca_am' is not defined

### choose the good features and rebuild the model 

In [220]:
best_score = []
best_params = []
numbers = [1,5,10,20,100,0]
for interval in intervals:
    print('interval is',interval)
    df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = ca)
    score = []
    params = []
    #select the best ten feature
    for number in numbers:
        arg = np.argsort(df_ca_am.loc[interval].values)[number*-1:]
        # train
        train_X_f = train_X[:,arg]
        param_test1 = rf_params_test
        gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
        gsearch1.fit(train_X_f,train_rise)
        params.append(gsearch1.best_params_)
        score.append(gsearch1.best_score_)

#         model = RandomForestClassifier()
#         model.set_params(**gsearch1.best_params_)
    best_score.append(score)
    best_params.append(params)

interval is 0
interval is 0.1
interval is 0.25
interval is 0.5
interval is 0.75
interval is 1
interval is 1.5


In [221]:
df_ca_am_com = pd.DataFrame(best_score,columns=numbers,index = intervals)
df_ca_am_com

Unnamed: 0,1,5,10,20,100,0
0.0,0.554163,0.57923,0.587287,0.565801,0.571173,0.553268
0.1,0.584602,0.612355,0.61325,0.61504,0.59803,0.554163
0.25,0.649955,0.655327,0.660698,0.660698,0.648165,0.631155
0.5,0.730528,0.732319,0.730528,0.729633,0.726052,0.717995
0.75,0.800358,0.800358,0.800358,0.800358,0.802149,0.800358
1.0,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131
1.5,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285


In [222]:
df_ca_am_com.loc[intervals[0]].mean()

0.5684870188003581

##### combining the good feature(10 is better) makes the model better

### choose the feature by outlier 

In [223]:
df_list = [SP,Russell,DJ,SOX,VIX]
df_option = [True,True,True,True,True]
df_name = ['SP','Russell','DJ','SOX','VIX']
intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
cas = [10]
window = 1

ca_scores_15 = []
ca_params_15 = []
for ca in cas:
    best_score = []
    best_params = []
    for interval in intervals:
        print('interval is',interval)
        df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = ca)
        amount_of_features = len(df_total.columns)-3
        half = int(amount_of_features/2)
        score = []
        params = []
        for index in range(half):
            print('feature',index)
            if (df_total.columns[index].find('1.0') != -1):
                train_X_f = train_X[:,[index,index+half,index+1,index+1+half,index+2,index+2+half,index+3,index+3+half,
                                     index+6,index+6+half,index+7,index+7+half,index+8,index+8+half,index+9,index+9+half]]
#                 test_X_f = test_X[:,[index,index+half]]
#             elif (df_total.columns[index].find('7.0') != -1):
#                 train_X_f = train_X[:,[index,index+half,index+1,index+1+half,index+2,index+2+half,index+3,index+3+half]]
# #                 test_X_f = test_X[:,[index,index+half]]
            elif (df_total.columns[index].find('5.0') != -1):
                train_X_f = train_X[:,[index,index+half,index+1,index+1+half]]
#                 test_X_f = test_X[:,[index,index+half]]
            else:
                continue
            param_test1 = rf_params_test
            gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
            gsearch1.fit(train_X_f,train_rise)
            params.append(gsearch1.best_params_)
            score.append(gsearch1.best_score_)

    #         model = RandomForestClassifier()
    #         model.set_params(**gsearch1.best_params_)

        best_score.append(score)
        best_params.append(params)
    ca_scores_15.append(best_score)
    ca_params_15.append(best_params)

interval is 0
feature 0
feature 1
feature 2
feature 3
feature 4
feature 5
feature 6
feature 7
feature 8
feature 9
feature 10
feature 11
feature 12
feature 13
feature 14
feature 15
feature 16
feature 17
feature 18
feature 19
feature 20
feature 21
feature 22
feature 23
feature 24
feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
featur

feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
feature 90
feature 91
feature 92
feature 93
feature 94
feature 95
feature 96
feature 97
feature 98
feature 99
feature 100
feature 101
feature 102
feature 103
feature 104
feature 105
feature 106
feature 107
feature 108
feature 109
feature 110
feature 111
feature 112
feature 113
feature

feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
feature 90
feature 91
feature 92
feature 93
feature 94
feature 95
feature 96
feature 97
feature 98
feature 99
feature 100
feature 101
feature 102
feature 103
feature 104
feature 105
feature 106
feature 107
feature 108
feature 109
feature 110
feature 111
feature 112
feature 113
feature 114
feature 115
feature 116
feature 117
feature 118
feature 119
feature 120
feature 121
feature 122
feature 123
feature 124
feature 125
feature 126
feature 127
feature 128
fea

In [224]:
df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = 2)
half = int((len(df_total.columns)-3)/2)

df_ca_15 = pd.DataFrame(ca_scores_15[0],columns=df_total.columns[:half-1],index = intervals)
df_ca_15

Unnamed: 0,SP_open_1_1.0,SP_open_1_2.0,SP_high_1_1.0,SP_high_1_2.0,SP_low_1_1.0,SP_low_1_2.0,SP_close_1_1.0,SP_close_1_2.0,SP_volume_1_1.0,SP_volume_1_2.0,...,SOX_close_1_1.0,SOX_close_1_2.0,VIX_open_1_1.0,VIX_open_1_2.0,VIX_high_1_1.0,VIX_high_1_2.0,VIX_low_1_1.0,VIX_low_1_2.0,VIX_close_1_1.0,VIX_close_1_2.0
0.0,0.512086,0.510295,0.561325,0.50761,0.592659,0.497762,0.632945,0.50761,0.495971,0.505819,...,0.588183,0.476276,0.556849,0.52462,0.566697,0.514772,0.567592,0.496867,0.617726,0.485228
0.1,0.530886,0.540734,0.590868,0.535363,0.589973,0.540734,0.640107,0.540734,0.529991,0.540734,...,0.572963,0.523724,0.557744,0.540734,0.584602,0.540734,0.588183,0.527305,0.617726,0.485228
0.25,0.609669,0.609669,0.648165,0.609669,0.638317,0.609669,0.682184,0.609669,0.609669,0.604297,...,0.634736,0.587287,0.620412,0.609669,0.63026,0.609669,0.641898,0.609669,0.648165,0.609669
0.5,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.730528,0.726052,0.726052,0.726052,...,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052
0.75,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,...,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358
1.0,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,...,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131
1.5,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,...,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285


In [225]:
arg = np.argsort(df_ca_15.loc[intervals[0]].values)[-10:]
df_ca_15.loc[intervals[0]][arg].mean()

0.5947179946284692

In [226]:
df_list = [SP,Russell,DJ,SOX,VIX]
df_option = [True,True,True,True,True]
df_name = ['SP','Russell','DJ','SOX','VIX']
intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
cas = [10]
window = 1

ca_scores_14 = []
ca_params_14 = []
for ca in cas:
    best_score = []
    best_params = []
    for interval in intervals:
        print('interval is',interval)
        df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = ca)
        amount_of_features = len(df_total.columns)-3
        half = int(amount_of_features/2)
        score = []
        params = []
        for index in range(half):
            print('feature',index)
            if (df_total.columns[index].find('1.0') != -1):
                train_X_f = train_X[:,[index,index+half,index+1,index+1+half,index+2,index+2+half,
                                     index+7,index+7+half,index+8,index+8+half,index+9,index+9+half]]
#                 test_X_f = test_X[:,[index,index+half]]
#             elif (df_total.columns[index].find('7.0') != -1):
#                 train_X_f = train_X[:,[index,index+half,index+1,index+1+half,index+2,index+2+half,index+3,index+3+half]]
# #                 test_X_f = test_X[:,[index,index+half]]
            elif (df_total.columns[index].find('4.0') != -1):
                train_X_f = train_X[:,[index,index+half,index+1,index+1+half,index+2,index+2+half,index+3,index+3+half]]
#                 test_X_f = test_X[:,[index,index+half]]
            else:
                continue
            param_test1 = rf_params_test
            gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
            gsearch1.fit(train_X_f,train_rise)
            params.append(gsearch1.best_params_)
            score.append(gsearch1.best_score_)

    #         model = RandomForestClassifier()
    #         model.set_params(**gsearch1.best_params_)

        best_score.append(score)
        best_params.append(params)
    ca_scores_14.append(best_score)
    ca_params_14.append(best_params)

interval is 0
feature 0
feature 1
feature 2
feature 3
feature 4
feature 5
feature 6
feature 7
feature 8
feature 9
feature 10
feature 11
feature 12
feature 13
feature 14
feature 15
feature 16
feature 17
feature 18
feature 19
feature 20
feature 21
feature 22
feature 23
feature 24
feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
featur

feature 24
feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
feature 90
feature 91
feature 92
feature 93
feature 94
feature 95
feature 96
feature 97
feature 98
feature 99
feature 100
feature 101
feature 102
feature 103
feature 104
feature 105
feature 106
feature 107
feature 108
feature 109
feature 110
feature 111
feature 112
feature 

feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
feature 90
feature 91
feature 92
feature 93
feature 94
feature 95
feature 96
feature 97
feature 98
feature 99
feature 100
feature 101
feature 102
feature 103
feature 104
feature 105
feature 106
feature 107
feature 108
feature 109
feature 110
feature 111
feature 112
feature 113
feature 114
feature 115
feature 116
feature 117
feature 118
feature 119
feature 120
feature 121
feature 122
feature 123
feature 124
feature 125
feature 126
feature 127
feature 128
feature 129
feature 130
feature 131


In [227]:
df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = 2)
half = int((len(df_total.columns)-3)/2)

df_ca_14 = pd.DataFrame(ca_scores_15[0],columns=df_total.columns[:half-1],index = intervals)
df_ca_14

Unnamed: 0,SP_open_1_1.0,SP_open_1_2.0,SP_high_1_1.0,SP_high_1_2.0,SP_low_1_1.0,SP_low_1_2.0,SP_close_1_1.0,SP_close_1_2.0,SP_volume_1_1.0,SP_volume_1_2.0,...,SOX_close_1_1.0,SOX_close_1_2.0,VIX_open_1_1.0,VIX_open_1_2.0,VIX_high_1_1.0,VIX_high_1_2.0,VIX_low_1_1.0,VIX_low_1_2.0,VIX_close_1_1.0,VIX_close_1_2.0
0.0,0.512086,0.510295,0.561325,0.50761,0.592659,0.497762,0.632945,0.50761,0.495971,0.505819,...,0.588183,0.476276,0.556849,0.52462,0.566697,0.514772,0.567592,0.496867,0.617726,0.485228
0.1,0.530886,0.540734,0.590868,0.535363,0.589973,0.540734,0.640107,0.540734,0.529991,0.540734,...,0.572963,0.523724,0.557744,0.540734,0.584602,0.540734,0.588183,0.527305,0.617726,0.485228
0.25,0.609669,0.609669,0.648165,0.609669,0.638317,0.609669,0.682184,0.609669,0.609669,0.604297,...,0.634736,0.587287,0.620412,0.609669,0.63026,0.609669,0.641898,0.609669,0.648165,0.609669
0.5,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.730528,0.726052,0.726052,0.726052,...,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052,0.726052
0.75,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,...,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358,0.800358
1.0,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,...,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131,0.862131
1.5,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,...,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285,0.946285


In [279]:
arg = np.argsort(df_ca_14.loc[intervals[0]].values)[-10:]
df_ca_14.loc[intervals[2]][arg]

VIX_high_1_1.0         0.630260
VIX_low_1_1.0          0.641898
Russell_low_1_1.0      0.632945
SOX_close_1_1.0        0.634736
SP_low_1_1.0           0.638317
Russell_close_1_1.0    0.645479
DJ_low_1_1.0           0.617726
DJ_close_1_1.0         0.669651
VIX_close_1_1.0        0.648165
SP_close_1_1.0         0.682184
Name: 0.25, dtype: float64

In [281]:
best_score = []
best_params = []
numbers = [1,2,3,5,10,20,0]
intervals = [0,0.1,0.25,0.5]

for interval in intervals:
    print('interval is',interval)
    df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = 10)
    score = []
    params = []
    #select the best ten feature
    for number in numbers:
        
        #find the good feature
        arg = np.argsort(df_ca_14.loc[interval].values)[number*-1:]
        col = list(df_ca_14.loc[intervals[0]][arg].index)
        index_list = []
        for c in col:
            if (c.find('1.0') != -1):
                for categroy in [1,2,3,8,9,10]:
                    index_list.append(c.replace('1.0',f'{categroy}.0'))
        indexs = [i for i,value in enumerate(df_total.columns.tolist()) if value in index_list]
        
        # train
        train_X_f = train_X[:,indexs]
        param_test1 = rf_params_test
        gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
        gsearch1.fit(train_X_f,train_rise)
        params.append(gsearch1.best_params_)
        score.append(gsearch1.best_score_)

#         model = RandomForestClassifier()
#         model.set_params(**gsearch1.best_params_)
    best_score.append(score)
    best_params.append(params)

interval is 0
interval is 0.1
interval is 0.25
interval is 0.5


In [282]:
df_ca_14_com = pd.DataFrame(best_score,columns=numbers,index = intervals)
df_ca_14_com

Unnamed: 0,1,2,3,5,10,20,0
0.0,0.624888,0.580125,0.583706,0.584602,0.574754,0.578335,0.567592
0.1,0.640107,0.581916,0.61504,0.578335,0.607878,0.584602,0.557744
0.25,0.682184,0.662489,0.669651,0.666965,0.659803,0.623993,0.617726
0.5,0.726052,0.729633,0.726052,0.726052,0.726052,0.732319,0.73769


##### It's bad for this to combine the good features
## Conclusion
* choosing the feature by the outlier is better than choosing the feature by the performance in the American data
* when choosing the feature by the outlier,use the best feature is better than use the combine feature

# All data in ca 

In [283]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]
df_name = ['Yunta50','Taiwan50','orderbook','oil_price','bond','TXF','EXF','FXF','E_F','Nikkei','SP','Russell','DJ','SOX','VIX','pcr','FOI','IOI','DOI']


intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
ca = 10
window = 1

best_score = []
best_params = []
for interval in intervals:
    print('interval is',interval)
    df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = ca)
    amount_of_features = len(df_total.columns)-3
    half = int(amount_of_features/2)
    score = []
    params = []
    for index in range(half):
        print('feature',index)
        train_X_f = train_X[:,index].reshape((-1,1))
        param_test1 = rf_params_test
        gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
        gsearch1.fit(train_X_f,train_rise)
        params.append(gsearch1.best_params_)
        score.append(gsearch1.best_score_)

#         model = RandomForestClassifier()
#         model.set_params(**gsearch1.best_params_)

    best_score.append(score)
    best_params.append(params)


interval is 0
feature 0
feature 1
feature 2
feature 3
feature 4
feature 5
feature 6
feature 7
feature 8
feature 9
feature 10
feature 11
feature 12
feature 13
feature 14
feature 15
feature 16
feature 17
feature 18
feature 19
feature 20
feature 21
feature 22
feature 23
feature 24
feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
featur

feature 693
feature 694
feature 695
feature 696
feature 697
feature 698
feature 699
feature 700
feature 701
feature 702
feature 703
feature 704
feature 705
feature 706
feature 707
feature 708
feature 709
feature 710
feature 711
feature 712
feature 713
feature 714
feature 715
feature 716
feature 717
feature 718
feature 719
feature 720
feature 721
feature 722
feature 723
feature 724
feature 725
feature 726
feature 727
feature 728
feature 729
feature 730
feature 731
feature 732
feature 733
feature 734
feature 735
feature 736
feature 737
feature 738
feature 739
feature 740
feature 741
feature 742
feature 743
feature 744
feature 745
feature 746
feature 747
feature 748
feature 749
feature 750
feature 751
feature 752
feature 753
feature 754
feature 755
feature 756
feature 757
feature 758
feature 759
feature 760
feature 761
feature 762
feature 763
feature 764
feature 765
feature 766
feature 767
feature 768
feature 769
feature 770
feature 771
feature 772
feature 773
feature 774
feature 775
feat

feature 595
feature 596
feature 597
feature 598
feature 599
feature 600
feature 601
feature 602
feature 603
feature 604
feature 605
feature 606
feature 607
feature 608
feature 609
feature 610
feature 611
feature 612
feature 613
feature 614
feature 615
feature 616
feature 617
feature 618
feature 619
feature 620
feature 621
feature 622
feature 623
feature 624
feature 625
feature 626
feature 627
feature 628
feature 629
feature 630
feature 631
feature 632
feature 633
feature 634
feature 635
feature 636
feature 637
feature 638
feature 639
feature 640
feature 641
feature 642
feature 643
feature 644
feature 645
feature 646
feature 647
feature 648
feature 649
feature 650
feature 651
feature 652
feature 653
feature 654
feature 655
feature 656
feature 657
feature 658
feature 659
feature 660
feature 661
feature 662
feature 663
feature 664
feature 665
feature 666
feature 667
feature 668
feature 669
feature 670
feature 671
feature 672
feature 673
feature 674
feature 675
feature 676
feature 677
feat

feature 396
feature 397
feature 398
feature 399
feature 400
feature 401
feature 402
feature 403
feature 404
feature 405
feature 406
feature 407
feature 408
feature 409
feature 410
feature 411
feature 412
feature 413
feature 414
feature 415
feature 416
feature 417
feature 418
feature 419
feature 420
feature 421
feature 422
feature 423
feature 424
feature 425
feature 426
feature 427
feature 428
feature 429
feature 430
feature 431
feature 432
feature 433
feature 434
feature 435
feature 436
feature 437
feature 438
feature 439
feature 440
feature 441
feature 442
feature 443
feature 444
feature 445
feature 446
feature 447
feature 448
feature 449
feature 450
feature 451
feature 452
feature 453
feature 454
feature 455
feature 456
feature 457
feature 458
feature 459
feature 460
feature 461
feature 462
feature 463
feature 464
feature 465
feature 466
feature 467
feature 468
feature 469
feature 470
feature 471
feature 472
feature 473
feature 474
feature 475
feature 476
feature 477
feature 478
feat

feature 299
feature 300
feature 301
feature 302
feature 303
feature 304
feature 305
feature 306
feature 307
feature 308
feature 309
feature 310
feature 311
feature 312
feature 313
feature 314
feature 315
feature 316
feature 317
feature 318
feature 319
feature 320
feature 321
feature 322
feature 323
feature 324
feature 325
feature 326
feature 327
feature 328
feature 329
feature 330
feature 331
feature 332
feature 333
feature 334
feature 335
feature 336
feature 337
feature 338
feature 339
feature 340
feature 341
feature 342
feature 343
feature 344
feature 345
feature 346
feature 347
feature 348
feature 349
feature 350
feature 351
feature 352
feature 353
feature 354
feature 355
feature 356
feature 357
feature 358
feature 359
feature 360
feature 361
feature 362
feature 363
feature 364
feature 365
feature 366
feature 367
feature 368
feature 369
feature 370
feature 371
feature 372
feature 373
feature 374
feature 375
feature 376
feature 377
feature 378
feature 379
feature 380
feature 381
feat

feature 202
feature 203
feature 204
feature 205
feature 206
feature 207
feature 208
feature 209
feature 210
feature 211
feature 212
feature 213
feature 214
feature 215
feature 216
feature 217
feature 218
feature 219
feature 220
feature 221
feature 222
feature 223
feature 224
feature 225
feature 226
feature 227
feature 228
feature 229
feature 230
feature 231
feature 232
feature 233
feature 234
feature 235
feature 236
feature 237
feature 238
feature 239
feature 240
feature 241
feature 242
feature 243
feature 244
feature 245
feature 246
feature 247
feature 248
feature 249
feature 250
feature 251
feature 252
feature 253
feature 254
feature 255
feature 256
feature 257
feature 258
feature 259
feature 260
feature 261
feature 262
feature 263
feature 264
feature 265
feature 266
feature 267
feature 268
feature 269
feature 270
feature 271
feature 272
feature 273
feature 274
feature 275
feature 276
feature 277
feature 278
feature 279
feature 280
feature 281
feature 282
feature 283
feature 284
feat

feature 105
feature 106
feature 107
feature 108
feature 109
feature 110
feature 111
feature 112
feature 113
feature 114
feature 115
feature 116
feature 117
feature 118
feature 119
feature 120
feature 121
feature 122
feature 123
feature 124
feature 125
feature 126
feature 127
feature 128
feature 129
feature 130
feature 131
feature 132
feature 133
feature 134
feature 135
feature 136
feature 137
feature 138
feature 139
feature 140
feature 141
feature 142
feature 143
feature 144
feature 145
feature 146
feature 147
feature 148
feature 149
feature 150
feature 151
feature 152
feature 153
feature 154
feature 155
feature 156
feature 157
feature 158
feature 159
feature 160
feature 161
feature 162
feature 163
feature 164
feature 165
feature 166
feature 167
feature 168
feature 169
feature 170
feature 171
feature 172
feature 173
feature 174
feature 175
feature 176
feature 177
feature 178
feature 179
feature 180
feature 181
feature 182
feature 183
feature 184
feature 185
feature 186
feature 187
feat

feature 788
feature 789
feature 790


In [284]:
df_ca_am = pd.DataFrame(best_score,columns=df_total.columns[:half],index = intervals)
df_ca_am

Unnamed: 0,Yunta50_open_1_1.0,Yunta50_open_1_2.0,Yunta50_open_1_3.0,Yunta50_open_1_4.0,Yunta50_open_1_5.0,Yunta50_open_1_6.0,Yunta50_open_1_7.0,Yunta50_open_1_8.0,Yunta50_open_1_9.0,Yunta50_open_1_10.0,...,DOI_close_1_2.0,DOI_close_1_3.0,DOI_close_1_4.0,DOI_close_1_5.0,DOI_close_1_6.0,DOI_close_1_7.0,DOI_close_1_8.0,DOI_close_1_9.0,DOI_close_1_10.0,mean_rise_fall
0.0,0.498279,0.499139,0.499139,0.499139,0.516351,0.499139,0.499139,0.506024,0.493115,0.499139,...,0.502582,0.499139,0.499139,0.506024,0.499139,0.5,0.499139,0.499139,0.499139,0.499139
0.1,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.536145,0.537005,0.537005,...,0.536145,0.537005,0.537005,0.537005,0.537005,0.537005,0.53012,0.537005,0.537005,0.537005
0.25,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.591222,0.605852,0.605852,...,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852
0.5,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,...,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031
0.75,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,...,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041
1.0,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,...,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585
1.5,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,...,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644


In [285]:
arg = np.argsort(df_ca_am.loc[intervals[0]].values)[-10:]
df_ca_am.loc[intervals[0]][arg].mean()

0.5425989672977625

### choose the good features and rebuild the model 

In [286]:
best_score = []
best_params = []
numbers = [1,5,10,20,100,0]
for interval in intervals:
    print('interval is',interval)
    df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = ca)
    score = []
    params = []
    #select the best ten feature
    for number in numbers:
        arg = np.argsort(df_ca_am.loc[interval].values)[number*-1:]
        # train
        train_X_f = train_X[:,arg]
        param_test1 = rf_params_test
        gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
        gsearch1.fit(train_X_f,train_rise)
        params.append(gsearch1.best_params_)
        score.append(gsearch1.best_score_)

#         model = RandomForestClassifier()
#         model.set_params(**gsearch1.best_params_)
    best_score.append(score)
    best_params.append(params)

interval is 0
interval is 0.1
interval is 0.25
interval is 0.5
interval is 0.75
interval is 1
interval is 1.5


In [287]:
df_ca_am_com = pd.DataFrame(best_score,columns=numbers,index = intervals)
df_ca_am_com

Unnamed: 0,1,5,10,20,100,0
0.0,0.547332,0.57401,0.57401,0.585198,0.561102,0.545611
0.1,0.579174,0.608434,0.593804,0.60241,0.562823,0.563683
0.25,0.644578,0.644578,0.639415,0.642857,0.645439,0.604131
0.5,0.73494,0.734079,0.736661,0.726334,0.722031,0.694492
0.75,0.796041,0.796041,0.797762,0.796041,0.796041,0.783133
1.0,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585
1.5,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644


In [288]:
df_ca_am_com.loc[intervals[0]].mean()

0.5645438898450946

##### combining the good feature(10 is better) makes the model better

### choose the feature by outlier 

In [289]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]
df_name = ['Yunta50','Taiwan50','orderbook','oil_price','bond','TXF','EXF','FXF','E_F','Nikkei','SP','Russell','DJ','SOX','VIX','pcr','FOI','IOI','DOI']
intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
cas = [10]
window = 1

ca_scores_15 = []
ca_params_15 = []
for ca in cas:
    best_score = []
    best_params = []
    for interval in intervals:
        print('interval is',interval)
        df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = ca)
        amount_of_features = len(df_total.columns)-3
        half = int(amount_of_features/2)
        score = []
        params = []
        for index in range(half):
            print('feature',index)
            if (df_total.columns[index].find('1.0') != -1):
                train_X_f = train_X[:,[index,index+half,index+1,index+1+half,index+2,index+2+half,index+3,index+3+half,
                                     index+6,index+6+half,index+7,index+7+half,index+8,index+8+half,index+9,index+9+half]]
#                 test_X_f = test_X[:,[index,index+half]]
#             elif (df_total.columns[index].find('7.0') != -1):
#                 train_X_f = train_X[:,[index,index+half,index+1,index+1+half,index+2,index+2+half,index+3,index+3+half]]
# #                 test_X_f = test_X[:,[index,index+half]]
            elif (df_total.columns[index].find('5.0') != -1):
                train_X_f = train_X[:,[index,index+half,index+1,index+1+half]]
#                 test_X_f = test_X[:,[index,index+half]]
            else:
                continue
            param_test1 = rf_params_test
            gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
            gsearch1.fit(train_X_f,train_rise)
            params.append(gsearch1.best_params_)
            score.append(gsearch1.best_score_)

    #         model = RandomForestClassifier()
    #         model.set_params(**gsearch1.best_params_)

        best_score.append(score)
        best_params.append(params)
    ca_scores_15.append(best_score)
    ca_params_15.append(best_params)

interval is 0
feature 0
feature 1
feature 2
feature 3
feature 4
feature 5
feature 6
feature 7
feature 8
feature 9
feature 10
feature 11
feature 12
feature 13
feature 14
feature 15
feature 16
feature 17
feature 18
feature 19
feature 20
feature 21
feature 22
feature 23
feature 24
feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
featur

feature 695
feature 696
feature 697
feature 698
feature 699
feature 700
feature 701
feature 702
feature 703
feature 704
feature 705
feature 706
feature 707
feature 708
feature 709
feature 710
feature 711
feature 712
feature 713
feature 714
feature 715
feature 716
feature 717
feature 718
feature 719
feature 720
feature 721
feature 722
feature 723
feature 724
feature 725
feature 726
feature 727
feature 728
feature 729
feature 730
feature 731
feature 732
feature 733
feature 734
feature 735
feature 736
feature 737
feature 738
feature 739
feature 740
feature 741
feature 742
feature 743
feature 744
feature 745
feature 746
feature 747
feature 748
feature 749
feature 750
feature 751
feature 752
feature 753
feature 754
feature 755
feature 756
feature 757
feature 758
feature 759
feature 760
feature 761
feature 762
feature 763
feature 764
feature 765
feature 766
feature 767
feature 768
feature 769
feature 770
feature 771
feature 772
feature 773
feature 774
feature 775
feature 776
feature 777
feat

feature 595
feature 596
feature 597
feature 598
feature 599
feature 600
feature 601
feature 602
feature 603
feature 604
feature 605
feature 606
feature 607
feature 608
feature 609
feature 610
feature 611
feature 612
feature 613
feature 614
feature 615
feature 616
feature 617
feature 618
feature 619
feature 620
feature 621
feature 622
feature 623
feature 624
feature 625
feature 626
feature 627
feature 628
feature 629
feature 630
feature 631
feature 632
feature 633
feature 634
feature 635
feature 636
feature 637
feature 638
feature 639
feature 640
feature 641
feature 642
feature 643
feature 644
feature 645
feature 646
feature 647
feature 648
feature 649
feature 650
feature 651
feature 652
feature 653
feature 654
feature 655
feature 656
feature 657
feature 658
feature 659
feature 660
feature 661
feature 662
feature 663
feature 664
feature 665
feature 666
feature 667
feature 668
feature 669
feature 670
feature 671
feature 672
feature 673
feature 674
feature 675
feature 676
feature 677
feat

feature 501
feature 502
feature 503
feature 504
feature 505
feature 506
feature 507
feature 508
feature 509
feature 510
feature 511
feature 512
feature 513
feature 514
feature 515
feature 516
feature 517
feature 518
feature 519
feature 520
feature 521
feature 522
feature 523
feature 524
feature 525
feature 526
feature 527
feature 528
feature 529
feature 530
feature 531
feature 532
feature 533
feature 534
feature 535
feature 536
feature 537
feature 538
feature 539
feature 540
feature 541
feature 542
feature 543
feature 544
feature 545
feature 546
feature 547
feature 548
feature 549
feature 550
feature 551
feature 552
feature 553
feature 554
feature 555
feature 556
feature 557
feature 558
feature 559
feature 560
feature 561
feature 562
feature 563
feature 564
feature 565
feature 566
feature 567
feature 568
feature 569
feature 570
feature 571
feature 572
feature 573
feature 574
feature 575
feature 576
feature 577
feature 578
feature 579
feature 580
feature 581
feature 582
feature 583
feat

feature 401
feature 402
feature 403
feature 404
feature 405
feature 406
feature 407
feature 408
feature 409
feature 410
feature 411
feature 412
feature 413
feature 414
feature 415
feature 416
feature 417
feature 418
feature 419
feature 420
feature 421
feature 422
feature 423
feature 424
feature 425
feature 426
feature 427
feature 428
feature 429
feature 430
feature 431
feature 432
feature 433
feature 434
feature 435
feature 436
feature 437
feature 438
feature 439
feature 440
feature 441
feature 442
feature 443
feature 444
feature 445
feature 446
feature 447
feature 448
feature 449
feature 450
feature 451
feature 452
feature 453
feature 454
feature 455
feature 456
feature 457
feature 458
feature 459
feature 460
feature 461
feature 462
feature 463
feature 464
feature 465
feature 466
feature 467
feature 468
feature 469
feature 470
feature 471
feature 472
feature 473
feature 474
feature 475
feature 476
feature 477
feature 478
feature 479
feature 480
feature 481
feature 482
feature 483
feat

feature 311
feature 312
feature 313
feature 314
feature 315
feature 316
feature 317
feature 318
feature 319
feature 320
feature 321
feature 322
feature 323
feature 324
feature 325
feature 326
feature 327
feature 328
feature 329
feature 330
feature 331
feature 332
feature 333
feature 334
feature 335
feature 336
feature 337
feature 338
feature 339
feature 340
feature 341
feature 342
feature 343
feature 344
feature 345
feature 346
feature 347
feature 348
feature 349
feature 350
feature 351
feature 352
feature 353
feature 354
feature 355
feature 356
feature 357
feature 358
feature 359
feature 360
feature 361
feature 362
feature 363
feature 364
feature 365
feature 366
feature 367
feature 368
feature 369
feature 370
feature 371
feature 372
feature 373
feature 374
feature 375
feature 376
feature 377
feature 378
feature 379
feature 380
feature 381
feature 382
feature 383
feature 384
feature 385
feature 386
feature 387
feature 388
feature 389
feature 390
feature 391
feature 392
feature 393
feat

feature 211
feature 212
feature 213
feature 214
feature 215
feature 216
feature 217
feature 218
feature 219
feature 220
feature 221
feature 222
feature 223
feature 224
feature 225
feature 226
feature 227
feature 228
feature 229
feature 230
feature 231
feature 232
feature 233
feature 234
feature 235
feature 236
feature 237
feature 238
feature 239
feature 240
feature 241
feature 242
feature 243
feature 244
feature 245
feature 246
feature 247
feature 248
feature 249
feature 250
feature 251
feature 252
feature 253
feature 254
feature 255
feature 256
feature 257
feature 258
feature 259
feature 260
feature 261
feature 262
feature 263
feature 264
feature 265
feature 266
feature 267
feature 268
feature 269
feature 270
feature 271
feature 272
feature 273
feature 274
feature 275
feature 276
feature 277
feature 278
feature 279
feature 280
feature 281
feature 282
feature 283
feature 284
feature 285
feature 286
feature 287
feature 288
feature 289
feature 290
feature 291
feature 292
feature 293
feat

feature 111
feature 112
feature 113
feature 114
feature 115
feature 116
feature 117
feature 118
feature 119
feature 120
feature 121
feature 122
feature 123
feature 124
feature 125
feature 126
feature 127
feature 128
feature 129
feature 130
feature 131
feature 132
feature 133
feature 134
feature 135
feature 136
feature 137
feature 138
feature 139
feature 140
feature 141
feature 142
feature 143
feature 144
feature 145
feature 146
feature 147
feature 148
feature 149
feature 150
feature 151
feature 152
feature 153
feature 154
feature 155
feature 156
feature 157
feature 158
feature 159
feature 160
feature 161
feature 162
feature 163
feature 164
feature 165
feature 166
feature 167
feature 168
feature 169
feature 170
feature 171
feature 172
feature 173
feature 174
feature 175
feature 176
feature 177
feature 178
feature 179
feature 180
feature 181
feature 182
feature 183
feature 184
feature 185
feature 186
feature 187
feature 188
feature 189
feature 190
feature 191
feature 192
feature 193
feat

In [290]:
df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = 2)
half = int((len(df_total.columns)-3)/2)

df_ca_15 = pd.DataFrame(ca_scores_15[0],columns=df_total.columns[:half-1],index = intervals)
df_ca_15

Unnamed: 0,Yunta50_open_1_1.0,Yunta50_open_1_2.0,Yunta50_high_1_1.0,Yunta50_high_1_2.0,Yunta50_low_1_1.0,Yunta50_low_1_2.0,Yunta50_close_1_1.0,Yunta50_close_1_2.0,Yunta50_volume_1_1.0,Yunta50_volume_1_2.0,...,IOI_close_1_1.0,IOI_close_1_2.0,DOI_open_1_1.0,DOI_open_1_2.0,DOI_high_1_1.0,DOI_high_1_2.0,DOI_low_1_1.0,DOI_low_1_2.0,DOI_close_1_1.0,DOI_close_1_2.0
0.0,0.505164,0.516351,0.498279,0.493115,0.490534,0.499139,0.517212,0.500861,0.506024,0.499139,...,0.499139,0.499139,0.510327,0.506024,0.510327,0.506024,0.510327,0.506024,0.510327,0.506024
0.1,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,...,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005
0.25,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,...,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852
0.5,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,...,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031
0.75,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,...,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041
1.0,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,...,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585
1.5,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,...,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644


In [291]:
arg = np.argsort(df_ca_15.loc[intervals[0]].values)[-10:]
df_ca_15.loc[intervals[0]][arg].mean()

0.5896729776247848

In [292]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]
df_name = ['Yunta50','Taiwan50','orderbook','oil_price','bond','TXF','EXF','FXF','E_F','Nikkei','SP','Russell','DJ','SOX','VIX','pcr','FOI','IOI','DOI']
intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
cas = [10]
window = 1

ca_scores_14 = []
ca_params_14 = []
for ca in cas:
    best_score = []
    best_params = []
    for interval in intervals:
        print('interval is',interval)
        df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = ca)
        amount_of_features = len(df_total.columns)-3
        half = int(amount_of_features/2)
        score = []
        params = []
        for index in range(half):
            print('feature',index)
            if (df_total.columns[index].find('1.0') != -1):
                train_X_f = train_X[:,[index,index+half,index+1,index+1+half,index+2,index+2+half,
                                     index+7,index+7+half,index+8,index+8+half,index+9,index+9+half]]
#                 test_X_f = test_X[:,[index,index+half]]
#             elif (df_total.columns[index].find('7.0') != -1):
#                 train_X_f = train_X[:,[index,index+half,index+1,index+1+half,index+2,index+2+half,index+3,index+3+half]]
# #                 test_X_f = test_X[:,[index,index+half]]
            elif (df_total.columns[index].find('4.0') != -1):
                train_X_f = train_X[:,[index,index+half,index+1,index+1+half,index+2,index+2+half,index+3,index+3+half]]
#                 test_X_f = test_X[:,[index,index+half]]
            else:
                continue
            param_test1 = rf_params_test
            gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
            gsearch1.fit(train_X_f,train_rise)
            params.append(gsearch1.best_params_)
            score.append(gsearch1.best_score_)

    #         model = RandomForestClassifier()
    #         model.set_params(**gsearch1.best_params_)

        best_score.append(score)
        best_params.append(params)
    ca_scores_14.append(best_score)
    ca_params_14.append(best_params)

interval is 0
feature 0
feature 1
feature 2
feature 3
feature 4
feature 5
feature 6
feature 7
feature 8
feature 9
feature 10
feature 11
feature 12
feature 13
feature 14
feature 15
feature 16
feature 17
feature 18
feature 19
feature 20
feature 21
feature 22
feature 23
feature 24
feature 25
feature 26
feature 27
feature 28
feature 29
feature 30
feature 31
feature 32
feature 33
feature 34
feature 35
feature 36
feature 37
feature 38
feature 39
feature 40
feature 41
feature 42
feature 43
feature 44
feature 45
feature 46
feature 47
feature 48
feature 49
feature 50
feature 51
feature 52
feature 53
feature 54
feature 55
feature 56
feature 57
feature 58
feature 59
feature 60
feature 61
feature 62
feature 63
feature 64
feature 65
feature 66
feature 67
feature 68
feature 69
feature 70
feature 71
feature 72
feature 73
feature 74
feature 75
feature 76
feature 77
feature 78
feature 79
feature 80
feature 81
feature 82
feature 83
feature 84
feature 85
feature 86
feature 87
feature 88
feature 89
featur

feature 691
feature 692
feature 693
feature 694
feature 695
feature 696
feature 697
feature 698
feature 699
feature 700
feature 701
feature 702
feature 703
feature 704
feature 705
feature 706
feature 707
feature 708
feature 709
feature 710
feature 711
feature 712
feature 713
feature 714
feature 715
feature 716
feature 717
feature 718
feature 719
feature 720
feature 721
feature 722
feature 723
feature 724
feature 725
feature 726
feature 727
feature 728
feature 729
feature 730
feature 731
feature 732
feature 733
feature 734
feature 735
feature 736
feature 737
feature 738
feature 739
feature 740
feature 741
feature 742
feature 743
feature 744
feature 745
feature 746
feature 747
feature 748
feature 749
feature 750
feature 751
feature 752
feature 753
feature 754
feature 755
feature 756
feature 757
feature 758
feature 759
feature 760
feature 761
feature 762
feature 763
feature 764
feature 765
feature 766
feature 767
feature 768
feature 769
feature 770
feature 771
feature 772
feature 773
feat

feature 594
feature 595
feature 596
feature 597
feature 598
feature 599
feature 600
feature 601
feature 602
feature 603
feature 604
feature 605
feature 606
feature 607
feature 608
feature 609
feature 610
feature 611
feature 612
feature 613
feature 614
feature 615
feature 616
feature 617
feature 618
feature 619
feature 620
feature 621
feature 622
feature 623
feature 624
feature 625
feature 626
feature 627
feature 628
feature 629
feature 630
feature 631
feature 632
feature 633
feature 634
feature 635
feature 636
feature 637
feature 638
feature 639
feature 640
feature 641
feature 642
feature 643
feature 644
feature 645
feature 646
feature 647
feature 648
feature 649
feature 650
feature 651
feature 652
feature 653
feature 654
feature 655
feature 656
feature 657
feature 658
feature 659
feature 660
feature 661
feature 662
feature 663
feature 664
feature 665
feature 666
feature 667
feature 668
feature 669
feature 670
feature 671
feature 672
feature 673
feature 674
feature 675
feature 676
feat

feature 494
feature 495
feature 496
feature 497
feature 498
feature 499
feature 500
feature 501
feature 502
feature 503
feature 504
feature 505
feature 506
feature 507
feature 508
feature 509
feature 510
feature 511
feature 512
feature 513
feature 514
feature 515
feature 516
feature 517
feature 518
feature 519
feature 520
feature 521
feature 522
feature 523
feature 524
feature 525
feature 526
feature 527
feature 528
feature 529
feature 530
feature 531
feature 532
feature 533
feature 534
feature 535
feature 536
feature 537
feature 538
feature 539
feature 540
feature 541
feature 542
feature 543
feature 544
feature 545
feature 546
feature 547
feature 548
feature 549
feature 550
feature 551
feature 552
feature 553
feature 554
feature 555
feature 556
feature 557
feature 558
feature 559
feature 560
feature 561
feature 562
feature 563
feature 564
feature 565
feature 566
feature 567
feature 568
feature 569
feature 570
feature 571
feature 572
feature 573
feature 574
feature 575
feature 576
feat

feature 404
feature 405
feature 406
feature 407
feature 408
feature 409
feature 410
feature 411
feature 412
feature 413
feature 414
feature 415
feature 416
feature 417
feature 418
feature 419
feature 420
feature 421
feature 422
feature 423
feature 424
feature 425
feature 426
feature 427
feature 428
feature 429
feature 430
feature 431
feature 432
feature 433
feature 434
feature 435
feature 436
feature 437
feature 438
feature 439
feature 440
feature 441
feature 442
feature 443
feature 444
feature 445
feature 446
feature 447
feature 448
feature 449
feature 450
feature 451
feature 452
feature 453
feature 454
feature 455
feature 456
feature 457
feature 458
feature 459
feature 460
feature 461
feature 462
feature 463
feature 464
feature 465
feature 466
feature 467
feature 468
feature 469
feature 470
feature 471
feature 472
feature 473
feature 474
feature 475
feature 476
feature 477
feature 478
feature 479
feature 480
feature 481
feature 482
feature 483
feature 484
feature 485
feature 486
feat

feature 314
feature 315
feature 316
feature 317
feature 318
feature 319
feature 320
feature 321
feature 322
feature 323
feature 324
feature 325
feature 326
feature 327
feature 328
feature 329
feature 330
feature 331
feature 332
feature 333
feature 334
feature 335
feature 336
feature 337
feature 338
feature 339
feature 340
feature 341
feature 342
feature 343
feature 344
feature 345
feature 346
feature 347
feature 348
feature 349
feature 350
feature 351
feature 352
feature 353
feature 354
feature 355
feature 356
feature 357
feature 358
feature 359
feature 360
feature 361
feature 362
feature 363
feature 364
feature 365
feature 366
feature 367
feature 368
feature 369
feature 370
feature 371
feature 372
feature 373
feature 374
feature 375
feature 376
feature 377
feature 378
feature 379
feature 380
feature 381
feature 382
feature 383
feature 384
feature 385
feature 386
feature 387
feature 388
feature 389
feature 390
feature 391
feature 392
feature 393
feature 394
feature 395
feature 396
feat

feature 224
feature 225
feature 226
feature 227
feature 228
feature 229
feature 230
feature 231
feature 232
feature 233
feature 234
feature 235
feature 236
feature 237
feature 238
feature 239
feature 240
feature 241
feature 242
feature 243
feature 244
feature 245
feature 246
feature 247
feature 248
feature 249
feature 250
feature 251
feature 252
feature 253
feature 254
feature 255
feature 256
feature 257
feature 258
feature 259
feature 260
feature 261
feature 262
feature 263
feature 264
feature 265
feature 266
feature 267
feature 268
feature 269
feature 270
feature 271
feature 272
feature 273
feature 274
feature 275
feature 276
feature 277
feature 278
feature 279
feature 280
feature 281
feature 282
feature 283
feature 284
feature 285
feature 286
feature 287
feature 288
feature 289
feature 290
feature 291
feature 292
feature 293
feature 294
feature 295
feature 296
feature 297
feature 298
feature 299
feature 300
feature 301
feature 302
feature 303
feature 304
feature 305
feature 306
feat

feature 134
feature 135
feature 136
feature 137
feature 138
feature 139
feature 140
feature 141
feature 142
feature 143
feature 144
feature 145
feature 146
feature 147
feature 148
feature 149
feature 150
feature 151
feature 152
feature 153
feature 154
feature 155
feature 156
feature 157
feature 158
feature 159
feature 160
feature 161
feature 162
feature 163
feature 164
feature 165
feature 166
feature 167
feature 168
feature 169
feature 170
feature 171
feature 172
feature 173
feature 174
feature 175
feature 176
feature 177
feature 178
feature 179
feature 180
feature 181
feature 182
feature 183
feature 184
feature 185
feature 186
feature 187
feature 188
feature 189
feature 190
feature 191
feature 192
feature 193
feature 194
feature 195
feature 196
feature 197
feature 198
feature 199
feature 200
feature 201
feature 202
feature 203
feature 204
feature 205
feature 206
feature 207
feature 208
feature 209
feature 210
feature 211
feature 212
feature 213
feature 214
feature 215
feature 216
feat

In [293]:
df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = 2)
half = int((len(df_total.columns)-3)/2)

df_ca_14 = pd.DataFrame(ca_scores_15[0],columns=df_total.columns[:half-1],index = intervals)
df_ca_14

Unnamed: 0,Yunta50_open_1_1.0,Yunta50_open_1_2.0,Yunta50_high_1_1.0,Yunta50_high_1_2.0,Yunta50_low_1_1.0,Yunta50_low_1_2.0,Yunta50_close_1_1.0,Yunta50_close_1_2.0,Yunta50_volume_1_1.0,Yunta50_volume_1_2.0,...,IOI_close_1_1.0,IOI_close_1_2.0,DOI_open_1_1.0,DOI_open_1_2.0,DOI_high_1_1.0,DOI_high_1_2.0,DOI_low_1_1.0,DOI_low_1_2.0,DOI_close_1_1.0,DOI_close_1_2.0
0.0,0.505164,0.516351,0.498279,0.493115,0.490534,0.499139,0.517212,0.500861,0.506024,0.499139,...,0.499139,0.499139,0.510327,0.506024,0.510327,0.506024,0.510327,0.506024,0.510327,0.506024
0.1,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,...,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005,0.537005
0.25,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,...,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852,0.605852
0.5,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,...,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031,0.722031
0.75,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,...,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041,0.796041
1.0,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,...,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585,0.860585
1.5,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,...,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644,0.946644


In [294]:
arg = np.argsort(df_ca_14.loc[intervals[0]].values)[-10:]
df_ca_14.loc[intervals[2]][arg]

SOX_close_1_1.0        0.629088
Russell_low_1_1.0      0.636833
SOX_low_1_1.0          0.635112
DJ_high_1_1.0          0.650602
SP_low_1_1.0           0.634251
VIX_close_1_1.0        0.624785
Russell_close_1_1.0    0.636833
DJ_low_1_1.0           0.605852
DJ_close_1_1.0         0.660929
SP_close_1_1.0         0.656627
Name: 0.25, dtype: float64

In [295]:
best_score = []
best_params = []
numbers = [1,2,3,5,10,20,0]
intervals = [0,0.1,0.25,0.5]

for interval in intervals:
    print('interval is',interval)
    df_total,train_X ,train_rise,train_fall,test_X,test_rise,test_fall,trf = get_dataframe_feature(df_list,df_option,Yunta50_rate,interval,window,ca = 10)
    score = []
    params = []
    #select the best ten feature
    for number in numbers:
        
        #find the good feature
        arg = np.argsort(df_ca_14.loc[interval].values)[number*-1:]
        col = list(df_ca_14.loc[intervals[0]][arg].index)
        index_list = []
        for c in col:
            if (c.find('1.0') != -1):
                for categroy in [1,2,3,8,9,10]:
                    index_list.append(c.replace('1.0',f'{categroy}.0'))
        indexs = [i for i,value in enumerate(df_total.columns.tolist()) if value in index_list]
        
        # train
        train_X_f = train_X[:,indexs]
        param_test1 = rf_params_test
        gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='accuracy',cv=2)
        gsearch1.fit(train_X_f,train_rise)
        params.append(gsearch1.best_params_)
        score.append(gsearch1.best_score_)

#         model = RandomForestClassifier()
#         model.set_params(**gsearch1.best_params_)
    best_score.append(score)
    best_params.append(params)

interval is 0
interval is 0.1
interval is 0.25
interval is 0.5


In [296]:
df_ca_14_com = pd.DataFrame(best_score,columns=numbers,index = intervals)
df_ca_14_com

Unnamed: 0,1,2,3,5,10,20,0
0.0,0.615318,0.583477,0.578313,0.572289,0.582616,0.563683,0.547332
0.1,0.623924,0.604991,0.60241,0.592943,0.593804,0.57401,0.545611
0.25,0.659208,0.645439,0.651463,0.651463,0.630809,0.623924,0.612737
0.5,0.73494,0.73494,0.738382,0.733219,0.729776,0.732358,0.722031


##### It's bad for this to combine the good features
## Conclusion
* choosing the feature by the outlier is better than choosing the feature by the performance in the American data
* when choosing the feature by the outlier,use the best feature is better than use the combine feature