In [1]:
# Load in our libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib as mpl
%matplotlib inline
mpl.style.use('seaborn')

import warnings
warnings.filterwarnings('ignore')

import time
import datetime as dt

from sklearn import preprocessing
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier,\
                              GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

import math
import statistics



In [2]:
from data_processing import *

# Data

In [3]:
data = pd.read_csv("tsharep.csv",encoding=' big5-hkscs ').rename(columns={'代碼':'code','日期':'date','中文簡稱':'name','開盤價(元)':'open','最高價(元)':'high','最低價(元)':'low','收盤價(元)':'close','成交張數(張)':'volume'})
Taiwan50 = extract_target_index('ETF50.xlsx',data)
TXF = load_txt_TXF('TXF1-300-分鐘.txt',Taiwan50)
EXF = load_txt_2('EXF1-1-日.txt',Taiwan50)
FXF = load_txt_2('FXF1-1-日.txt',Taiwan50)
E_F = EXF/FXF
Nikkei = load_csv('Nikkei225.csv',Taiwan50)
VIX = load_csv('VIX.csv',Taiwan50).drop(['volume'],axis = 1)
Russell = load_csv('Russell2000.csv',Taiwan50)
SP = load_csv('S&P500.csv',Taiwan50)
DJ = load_csv('Dow Jones.csv',Taiwan50)
SOX = load_csv('SOX.csv',Taiwan50).drop(['volume'],axis = 1)
pcr = load_txt('put_call_ratio-日-成交價.txt',Taiwan50).drop(['volume','open','low'],axis = 1)
FOI = load_txt('外資未平倉-成交價.txt',Taiwan50).drop(['volume'],axis = 1)
IOI = load_txt('投信未平倉-日-成交價.txt',Taiwan50).drop(['volume'],axis = 1)
DOI = load_txt('自營未平倉-日-成交價.txt',Taiwan50).drop(['volume'],axis = 1)
orderbook_ = pd.read_hdf('twse_orderbook_comp.h5').rename(columns = {'時間':'date',
                                                                    '累積委託買進筆數':'ask_vol',
                                                                    '累積委託買進數量':'ask_count',
                                                                    '累積委託賣出筆數':'bid_vol',
                                                                    '累積委託賣出數量':'bid_count',
                                                                    '累積成交筆數':'vol',
                                                                    '累積成交數量':'count',
                                                                    '累積成交金額':'amount'})
orderbook = orderbook_[orderbook_.index==0].iloc[-1*Taiwan50.shape[0]-1:-1]
orderbook['date'] = Taiwan50.index.values
orderbook = orderbook.groupby('date').sum().drop(['vol','count','amount'],axis = 1)
ETFtable = ETF_data_processing('tetfp.csv')
Yunta50 = extract_target_ETF(ETFtable,50)
Yunta50_rate = get_change_rate(Yunta50)
oil_price = pd.read_csv('oil_price.csv')
oil_price['date'] = oil_price['date'].map(lambda x:dt.datetime.strptime(str(x),'%Y/%m/%d'))
oil_price = oil_price.set_index('date')

fiveyear_bond = pd.read_excel('five_year_bond.xls').rename(columns = {'observation_date':'date'}).set_index('date')
tenyear_bond = pd.read_excel('ten_year_bond.xls').rename(columns = {'observation_date':'date'}).set_index('date')
bond = pd.concat([fiveyear_bond,tenyear_bond],axis = 1).loc[Taiwan50.index[0]:Taiwan50.index[-1]].rename(columns = {'DGS5':'DGSfive'
                                                                                                                    ,'DGS10':'DGSten'})
mrf = pd.Series(get_mean_rf(Yunta50,'close',20,5),index=Yunta50.index)

In [4]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]

In [5]:
Yunta50.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,54.0,54.65,53.9,54.4,16487.0
2013-01-03,54.9,55.05,54.65,54.85,29020.0
2013-01-04,54.85,54.85,54.4,54.5,9837.0
2013-01-07,54.55,54.55,53.9,54.25,8910.0
2013-01-08,54.0,54.2,53.65,53.9,12507.0


In [6]:
pd.concat(df_list,axis = 1).dropna(axis = 0,how = 'any').head()

Unnamed: 0_level_0,open,high,low,close,volume,i_open,i_high,i_low,i_close,i_volume,...,low,close,open,high,low,close,open,high,low,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-04,54.85,54.85,54.4,54.5,9837.0,95.923933,96.701394,94.465837,95.679435,27253.8958,...,-1545.0,-1545.0,760.0,760.0,760.0,760.0,3495.0,3495.0,3495.0,3495.0
2013-01-07,54.55,54.55,53.9,54.25,8910.0,95.366814,95.490941,92.904071,93.766549,25203.1961,...,-3215.0,-3215.0,689.0,689.0,689.0,689.0,4306.0,4306.0,4306.0,4306.0
2013-01-08,54.0,54.2,53.65,53.9,12507.0,93.298728,93.83811,92.366784,93.389683,20860.6399,...,-5085.0,-5085.0,514.0,514.0,514.0,514.0,4156.0,4156.0,4156.0,4156.0
2013-01-09,53.75,54.3,53.75,54.1,7529.0,93.757349,94.742786,92.984312,93.901249,22238.8329,...,1160.0,1160.0,567.0,567.0,567.0,567.0,1334.0,1334.0,1334.0,1334.0
2013-01-10,54.3,54.65,54.15,54.5,13953.0,94.421509,95.201896,93.266828,94.43672,31815.6312,...,2262.0,2262.0,977.0,977.0,977.0,977.0,896.0,896.0,896.0,896.0


# Model


In [7]:
models  =  {
    'DecisionTreeClassifier' : DecisionTreeClassifier(random_state = 0),
         'RandomForestClassifier': RandomForestClassifier(random_state = 0),
#     'ExtraTreesClassifier': ExtraTreesClassifier(random_state = 0),
#     'AdaBoostClassifier': AdaBoostClassifier(base_estimator = DecisionTreeClassifier(),\
#                                              n_estimators = 10,random_state = 0),
#     'GradientBoostingClassifier': GradientBoostingClassifier(random_state = 0),
#     'SVC': SVC(probability=True,random_state = 0),
}
    
model_grid_params = {
        'DecisionTreeClassifier':{'max_depth': [None,1,5,10],'min_samples_leaf': [1,2,5,10]},
    'RandomForestClassifier': {'max_features':[None],'n_estimators':[5,10],'max_depth':[2,10],\
                               'min_samples_split':[2],'criterion':['entropy'],\
                               'min_samples_leaf':[3]},
#     'ExtraTreesClassifier': {'max_features':[None],'n_estimators':[10],'max_depth':[10],\
#                              'min_samples_split':[2],'criterion':['entropy'],\
#                              'min_samples_leaf':[3]},
#     'AdaBoostClassifier': {"base_estimator__criterion" : ["entropy"],\
#                            "base_estimator__max_depth": [None],\
#                            "base_estimator__min_samples_leaf" : [3],\
#                            "base_estimator__min_samples_split" : [2],\
#                            "base_estimator__max_features" : [None]},
#     'GradientBoostingClassifier': {'max_features':[None],'n_estimators':[10],'max_depth':[10],\
#                                    'min_samples_split':[2],'min_samples_leaf':[3],\
#                                    'learning_rate':[0.1],'subsample':[1.0]},
#     'SVC': [{'kernel':['rbf'],'gamma':[1e-1],'C':[1]},\
#             {'kernel':['linear'],'C':[1,10]}]
}

In [8]:
class Model_Selection:
    
    def __init__(self,models,model_grid_params,stock_5,latest_day,pred_day,day):
        
        self.models = models
        self.model_grid = model_grid_params
        self.stock_5 = stock_5
        self.latest_day= latest_day
        self.pred_day = pred_day
        self.day = day
        self.keys = models.keys()
        self.best_score = {}
        self.grid = {}
        
        self.predict_values = {}
        self.predict_proba = {}
        self.cv_acc = {}
        self.acc = {}
        self.fscore = {}
        self.true_values = {}
        
        self.predict_values_day = {}
        self.predict_proba_day = {}
        self.cv_acc_day = {}
        self.acc_day = {}
        self.fscore_day = {}
        self.true_values_day = {}
        self.summary_day = []
        
    def Grid_fit(self,X_train,y_train,cv = 5,scoring = 'accuracy'):
        
        for key in self.keys:
            print ("Running GridSearchCV for %s" %(key))
            model = self.models[key]
            model_grid = self.model_grid[key]
            Grid = GridSearchCV(model, model_grid, cv = cv, scoring = scoring)
            Grid.fit(X_train,y_train) 
            self.grid[key] = Grid
            print (Grid.best_params_)
            print ('CV Best Score = %s'%(Grid.best_score_))
            self.cv_acc[key].append(Grid.best_score_)  
    
    def model_fit(self,X_train, y_train, X_test, y_test):
        
        for key in self.keys:
            print ("Running training & testing for %s." %(key))
            model = self.models[key]
            model.set_params(**self.grid[key].best_params_)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            proba = model.predict_proba(X_test)[:,0]
            #print 'Prediction latest 15 second = %s'%(predictions)
            self.predict_values[key].append(predictions.tolist())
            self.predict_proba[key].append(proba.tolist())
            self.true_values[key].append(y_test.tolist())
            acc = metrics.accuracy_score(y_test,predictions)
            f_score = metrics.f1_score(y_test,predictions)
            print ('Accuracy = %s'%(acc))
            self.acc[key].append(acc)
            self.fscore[key].append(f_score)
            
#             if key == 'SVC':
#                 if self.grid[key].best_params_.values()[0] == 'linear':
#                     feature_imp = dict(zip([i for i in range(0,64,1)],model.coef_[0]))
#                     Top_five = sorted(feature_imp.items(),key = lambda x : x[1] , reverse=True)[0:5]
#                     #print 'Kernel is linear and top five importance features = %s'%(Top_five)
#                 else:
#                     #print 'Kernel is rbf'
#                     pass
#             else: 
#                 feature_imp = dict(zip([i for i in range(0,64,1)],model.feature_importances_))
#                 Top_five = sorted(feature_imp.items(),key = lambda x : x[1] , reverse=True)[0:5]
#                 #print 'Top five importance features = %s'%(Top_five)
#                 pass

    def pipline(self):
        
        self.set_list_day() # store day values
        for day in range(0,self.day,1):
            self.set_list() # store values
            print ('Day = %s'%(day+1))
            for i in range(0,500,self.pred_day):#9000-self.latest_day-600,self.pred_day):
                
                print ('--------------------Rolling Window Time = %s--------------------'%(i/pred_day))
                # Train data
                data_train = self.stock_5[day][i:i+self.latest_day]
                X_train = data_train[:,:-3]
                train_rise = data_train[:,-3]
                train_fall = data_train[:,-2]
                train_noise = data_train[:,-1]
                y_train = train_rise

                # Test data
                data_test = self.stock_5[day][i + self.latest_day:i + self.latest_day + self.pred_day]
                X_test = data_test[:,:-3]
                test_rise = data_test[:,-3]
                test_fall = data_test[:,-2]
                test_noise = data_test[:,-1]
                y_test = test_rise
                
                #start = time.time()
                self.Grid_fit(X_train, y_train, cv = 2, scoring = 'accuracy')
                self.model_fit(X_train, y_train,X_test,y_test)
                
                
                #end = time.time()
                #print 'Total Time = %s'%(end - start)
                
            for key in self.keys:
                
                self.cv_acc_day[key].append(self.cv_acc[key])
                self.acc_day[key].append(self.acc[key])
                self.fscore_day[key].append(self.fscore[key])
                self.true_values_day[key].append(self.true_values[key])
                self.predict_values_day[key].append(self.predict_values[key])
                self.predict_proba_day[key].append(self.predict_proba[key])
                
            self.summary_day.append(self.score_summary(sort_by = 'Accuracy_mean'))
    
    def set_list(self):
        
        for key in self.keys:
            self.predict_values[key] = []
            self.cv_acc[key] = []
            self.acc[key] = []
            self.fscore[key] = []
            self.true_values[key] = []
            
    def set_list_day(self):
        
        for key in self.keys:
            self.predict_values_day[key] = []
            self.cv_acc_day[key] = []
            self.acc_day[key] = []
            self.fscore_day[key] = []
            self.true_values_day[key] = []
            
    def score_summary(self,sort_by):
        
        summary = pd.concat([pd.Series(list(self.acc.keys())),pd.Series(map(lambda x: sum(self.acc[x])/len(self.acc[x]), self.acc)),\
                             pd.Series(list(map(lambda x: statistics.stdev(self.acc[x]), self.acc))),\
                             pd.Series(list(map(lambda x: max(self.acc[x]), self.acc))),\
                             pd.Series(list(map(lambda x: min(self.acc[x]), self.acc))),\
                             pd.Series(list(map(lambda x: sum(self.fscore[x])/len(self.fscore[x]), self.fscore)))],axis=1)
        summary.columns = ['Estimator','Accuracy_mean','Accuracy_std','Accuracy_max','Accuracy_min','F_score']
        summary.index.rename('Ranking', inplace=True)
        return summary.sort_values(by = [sort_by], ascending=False)
          
    def print_(self):

        print (self.predict_values)

In [9]:
class Model_Selection_rf:
    
    def __init__(self,models,model_grid_params,stock_5,trf_5,latest_day,pred_day,day):
        
        self.models = models
        self.model_grid = model_grid_params
        self.stock_5 = stock_5
        self.trf_5 = trf_5
        self.latest_day= latest_day
        self.pred_day = pred_day
        self.day = day
        self.keys = models.keys()
    
        
        #rise
        self.best_score_r = {}
        self.grid_r = {}
        
        self.predict_values_r = {}
        self.predict_proba_r = {}
        self.cv_acc_r = {}
        self.acc_r = {}
        self.fscore_r = {}
        self.true_values_r = {}
        
        self.predict_values_day_r = {}
        self.predict_proba_day_r = {}
        self.cv_acc_day_r = {}
        self.acc_day_r = {}
        self.fscore_day_r = {}
        self.true_values_day_r = {}
        self.summary_day_r = []
        
        #fall
        self.best_score_f = {}
        self.grid_f = {}
        
        self.predict_values_f = {}
        self.predict_proba_f = {}
        self.cv_acc_f = {}
        self.acc_f = {}
        self.fscore_f = {}
        self.true_values_f = {}
        
        self.predict_values_day_f = {}
        self.predict_proba_day_f = {}
        self.cv_acc_day_f = {}
        self.acc_day_f = {}
        self.fscore_day_f = {}
        self.true_values_day_f = {}
        self.summary_day_f = []
        
        #total
        self.acc ={}
        self.fscore = {}
        
        self.acc_day ={}
        self.fscore_day = {}
        
        self.summary_day = []
        
    def Grid_fit_r(self,X_train,y_train,cv = 5,scoring = 'accuracy'):
        
        for key in self.keys:
            print ("Running GridSearchCV for %s" %(key))
            model = self.models[key]
            model_grid = self.model_grid[key]
            Grid = GridSearchCV(model, model_grid, cv = cv, scoring = scoring)
            Grid.fit(X_train,y_train) 
            self.grid_r[key] = Grid
            print (Grid.best_params_)
            print ('CV Best Score = %s'%(Grid.best_score_))
            self.cv_acc_r[key].append(Grid.best_score_)  
            
    def Grid_fit_f(self,X_train,y_train,cv = 5,scoring = 'accuracy'):
        
        for key in self.keys:
            print ("Running GridSearchCV for %s" %(key))
            model = self.models[key]
            model_grid = self.model_grid[key]
            Grid = GridSearchCV(model, model_grid, cv = cv, scoring = scoring)
            Grid.fit(X_train,y_train) 
            self.grid_f[key] = Grid
            print (Grid.best_params_)
            print ('CV Best Score = %s'%(Grid.best_score_))
            self.cv_acc_f[key].append(Grid.best_score_) 
    
    def model_fit_r(self,X_train, y_train, X_test, y_test):
        
        for key in self.keys:
            print ("Running training & testing for %s." %(key))
            model = self.models[key]
            model.set_params(**self.grid_r[key].best_params_)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            proba = model.predict_proba(X_test)[:,0]
            #print 'Prediction latest 15 second = %s'%(predictions)
            self.predict_values_r[key].append(predictions.tolist())
            self.predict_proba_r[key].append(proba.tolist())
            
            self.true_values_r[key].append(y_test.tolist())
            acc = metrics.accuracy_score(y_test,predictions)
            f_score = metrics.f1_score(y_test,predictions)
            print ('Accuracy = %s'%(acc))
            self.acc_r[key].append(acc)
            self.fscore_r[key].append(f_score)
        
    def model_fit_f(self,X_train, y_train, X_test, y_test):
        
        for key in self.keys:
            print ("Running training & testing for %s." %(key))
            model = self.models[key]
            model.set_params(**self.grid_f[key].best_params_)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            proba = model.predict_proba(X_test)[:,0]
            #print 'Prediction latest 15 second = %s'%(predictions)
            self.predict_values_f[key].append(predictions.tolist())
            self.predict_proba_f[key].append(proba.tolist())
            self.true_values_f[key].append(y_test.tolist())
            acc = metrics.accuracy_score(y_test,predictions)
            f_score = metrics.f1_score(y_test,predictions)
            print ('Accuracy = %s'%(acc))
            self.acc_f[key].append(acc)
            self.fscore_f[key].append(f_score)
            
    def model_fit(self,proba_r,proba_f,y_test):
        
        for key in self.keys:
            pr = np.array(proba_r[key])
            pf = np.array(proba_f[key])
            prob = np.sign(pr[0]-pf[0])
            acc = metrics.accuracy_score(y_test,prob)
#             f_score = metrics.f1_score(y_test,predictions)
            print ('Accuracy = %s'%(acc))
            self.acc[key].append(acc)
#             self.fscore[key].append(f_score)
            
    def pipline(self):
        
        self.set_list_day() # store day values
        for day in range(0,self.day,1):
            self.set_list() # store values
            print ('Day = %s'%(day+1))
            for i in range(0,500,self.pred_day):#9000-self.latest_day-600,self.pred_day):
                
                print ('--------------------Rolling Window Time = %s--------------------'%(i/pred_day))
                # Train data
                data_train = self.stock_5[day][i:i+self.latest_day]
                X_train = data_train[:,:-3]
                train_rise = data_train[:,-3]
                train_fall = data_train[:,-2]
                train_noise = data_train[:,-1]
                

                # Test data
                data_test = self.stock_5[day][i + self.latest_day:i + self.latest_day + self.pred_day]
                X_test = data_test[:,:-3]
                test_rise = data_test[:,-3]
                test_fall = data_test[:,-2]
                test_noise = data_test[:,-1]
                
                y_test = self.trf_5[day][i + self.latest_day:i + self.latest_day + self.pred_day] 
                
                #start = time.time()
                self.Grid_fit_r(X_train, train_rise, cv = 2, scoring = 'accuracy')
                self.model_fit_r(X_train, train_rise,X_test,test_rise)
                
                self.Grid_fit_f(X_train, train_fall, cv = 2, scoring = 'accuracy')
                self.model_fit_f(X_train, train_fall,X_test,test_fall)
                
                self.model_fit(self.predict_proba_r,self.predict_proba_f,y_test)
                #end = time.time()
                #print 'Total Time = %s'%(end - start)
                
            for key in self.keys:
                
                self.cv_acc_day_r[key].append(self.cv_acc_r[key])
                self.acc_day_r[key].append(self.acc_r[key])
                self.fscore_day_r[key].append(self.fscore_r[key])
                self.true_values_day_r[key].append(self.true_values_r[key])
                self.predict_values_day_r[key].append(self.predict_values_r[key])
                self.predict_proba_day_r[key].append(self.predict_proba_r[key])
                
                self.cv_acc_day_f[key].append(self.cv_acc_f[key])
                self.acc_day_f[key].append(self.acc_f[key])
                self.fscore_day_f[key].append(self.fscore_f[key])
                self.true_values_day_f[key].append(self.true_values_f[key])
                self.predict_values_day_f[key].append(self.predict_values_f[key])
                self.predict_proba_day_f[key].append(self.predict_proba_f[key])
                
                self.acc_day[key].append(self.acc[key])
#                 self.fscore_day[key].append(self.f_score[key])
                
            self.summary_day_r.append(self.score_summary_r(sort_by = 'Accuracy_mean'))
            self.summary_day_f.append(self.score_summary_f(sort_by = 'Accuracy_mean'))
            self.summary_day.append(self.score_summary(sort_by = 'Accuracy_mean'))
    
    def set_list(self):
        
        for key in self.keys:
            self.predict_values_r[key] = []
            self.predict_proba_r[key] = []
            self.cv_acc_r[key] = []
            self.acc_r[key] = []
            self.fscore_r[key] = []
            self.true_values_r[key] = []
            
            self.predict_values_f[key] = []
            self.predict_proba_f[key] = []
            self.cv_acc_f[key] = []
            self.acc_f[key] = []
            self.fscore_f[key] = []
            self.true_values_f[key] = []
            
            self.acc[key] = []
            self.fscore[key] = []
            
    def set_list_day(self):
        
        for key in self.keys:
            self.predict_values_day_r[key] = []
            self.predict_proba_day_r[key] = []
            self.cv_acc_day_r[key] = []
            self.acc_day_r[key] = []
            self.fscore_day_r[key] = []
            self.true_values_day_r[key] = []
            
            self.predict_values_day_f[key] = []
            self.predict_proba_day_f[key] = []
            self.cv_acc_day_f[key] = []
            self.acc_day_f[key] = []
            self.fscore_day_f[key] = []
            self.true_values_day_f[key] = []
            
            self.acc_day[key] = []
            self.fscore_day[key] = []
            
    def score_summary_r(self,sort_by):
        
        summary_r = pd.concat([pd.Series(list(self.acc_r.keys())),pd.Series(map(lambda x: sum(self.acc_r[x])/len(self.acc_r[x]), self.acc_r)),\
                             pd.Series(list(map(lambda x: statistics.stdev(self.acc_r[x]), self.acc_r))),\
                             pd.Series(list(map(lambda x: max(self.acc_r[x]), self.acc_r))),\
                             pd.Series(list(map(lambda x: min(self.acc_r[x]), self.acc_r))),\
                             pd.Series(list(map(lambda x: sum(self.fscore_r[x])/len(self.fscore_r[x]), self.fscore_r)))],axis=1)
        summary_r.columns = ['Estimator','Accuracy_mean','Accuracy_std','Accuracy_max','Accuracy_min','F_score']
        summary_r.index.rename('Ranking', inplace=True)
        return summary_r.sort_values(by = [sort_by], ascending=False)
    
    def score_summary_f(self,sort_by):
        
        summary_f = pd.concat([pd.Series(list(self.acc_f.keys())),pd.Series(map(lambda x: sum(self.acc_f[x])/len(self.acc_f[x]), self.acc_f)),\
                             pd.Series(list(map(lambda x: statistics.stdev(self.acc_f[x]), self.acc_f))),\
                             pd.Series(list(map(lambda x: max(self.acc_f[x]), self.acc_f))),\
                             pd.Series(list(map(lambda x: min(self.acc_f[x]), self.acc_f))),\
                             pd.Series(list(map(lambda x: sum(self.fscore_f[x])/len(self.fscore_f[x]), self.fscore_f)))],axis=1)
        summary_f.columns = ['Estimator','Accuracy_mean','Accuracy_std','Accuracy_max','Accuracy_min','F_score']
        summary_f.index.rename('Ranking', inplace=True)
        return summary_f.sort_values(by = [sort_by], ascending=False)
    
    def score_summary(self,sort_by):         
        summary = pd.concat([pd.Series(list(self.acc.keys())),pd.Series(map(lambda x: sum(self.acc[x])/len(self.acc[x]), self.acc)),\
                             pd.Series(list(map(lambda x: statistics.stdev(self.acc[x]), self.acc))),\
                             pd.Series(list(map(lambda x: max(self.acc[x]), self.acc))),\
                             pd.Series(list(map(lambda x: min(self.acc[x]), self.acc)))],axis = 1)
#                              pd.Series(list(map(lambda x: sum(self.fscore[x])/len(self.fscore[x]), self.fscore)))],axis=1)
                             
        summary.columns = ['Estimator','Accuracy_mean','Accuracy_std','Accuracy_max','Accuracy_min']
        summary.index.rename('Ranking', inplace=True)
        return summary.sort_values(by = [sort_by], ascending=False)
    
    def print_(self):

        print (self.predict_values)

In [10]:
def get_dataframe(df_list,df_r,target,interval,window,ca=4,day = 1):
    df_list_rate = []
    df_rf = get_rise_fall(target,interval).shift(-1*day+1)
    
    #concat the df in the df_list
    for index,df in enumerate(df_list):
        if(df_r[index]):
            df_rate = get_change_rate(df,window).iloc[:,df.shape[1]:]
        else:
            df_rate = get_backday(df,window).iloc[:,df.shape[1]:]
            
        df_av  = get_av_ca(stock=df_rate,rf_df=df_rf,ca=ca,window = window).iloc[:,:-3]
        df_colname = list(map(lambda x:  df_name[index]+'_'+x,df_av.columns))
        df_dum = pd.get_dummies(df_av,prefix=df_colname,columns=df_av.columns)
        df_list_rate.append(df_dum)
        
    df_all = pd.concat(df_list_rate,axis =1)
    df_all['mean_rise_fall'] = mrf
    
    #create null columns for the null data
    df_null = df_all.copy()
    for columnname in df_all.columns:
        df_null[columnname+'isnull'] = list(map(lambda x: int(x),df_all.isna()[columnname]))
    df_null.drop(df_all.columns,axis = 1,inplace = True)

    df_total = pd.concat([df_all,df_null],axis = 1)
    df_total = pd.concat([df_total,df_rf.loc[df_total.index]],axis = 1).loc[TXF.index[25]:TXF.index[-1],:].fillna(0)
    
    #get target rise fall data
    Y = Yunta50.copy()
    Y['trf'] = np.sign((Yunta50-Yunta50.shift(1))['close'].values)
    Y = Y.shift(-1*day+1)
    trf = Y.loc[df_total.index]['trf']

    return [df_total ,trf]


In [11]:
def get_dataframe_drop(df_list,df_r,target,interval,window,ca=4,day = 1):
    df_list_rate = []
    df_rf = get_rise_fall(target,interval).shift(-1*day+1)
    
    #concat the df in the df_list
    for index,df in enumerate(df_list):
        if(df_r[index]):
            df_rate = get_change_rate(df,window).iloc[:,df.shape[1]:]
        else:
            df_rate = get_backday(df,window).iloc[:,df.shape[1]:]
            
        df_av  = get_av_ca(stock=df_rate,rf_df=df_rf,ca=ca,window = window).iloc[:,:-3]
        df_colname = list(map(lambda x:  df_name[index]+'_'+x,df_av.columns))
        df_dum = pd.get_dummies(df_av,prefix=df_colname,columns=df_av.columns)
        df_list_rate.append(df_dum)
        
    df_all = pd.concat(df_list_rate,axis =1)
    df_all['mean_rise_fall'] = mrf
    
    df_total = df_all.dropna(axis = 0,how = 'any')
    df_total = pd.concat([df_total,df_rf.loc[df_total.index]],axis = 1).dropna(axis = 0,how = 'any')

    #get target rise fall data
    Y = Yunta50.copy()
    Y['trf'] = np.sign((Yunta50-Yunta50.shift(1))['close'].values)
    Y = Y.shift(-1*day+1)
    trf = Y.loc[df_total.index]['trf']

    return [df_total,trf]


In [12]:
def get_dataframe_rf(df_list,df_r,target,interval,window,ca=4,day = 1):
    df_list_rate = []
    df_rf = get_rise_fall(target,interval).shift(-1*day+1)
    
    #concat the df in the df_list
    for index,df in enumerate(df_list):
        if(df_r[index]):
            df_rate = get_change_rate(df,window).iloc[:,df.shape[1]:]
        else:
            df_rate = get_backday(df,window).iloc[:,df.shape[1]:]
            
        df_av  = get_av_ca(stock=df_rate,rf_df=df_rf,ca=ca,window = window).iloc[:,:-3]
        df_colname = list(map(lambda x:  df_name[index]+'_'+x,df_av.columns))
        df_dum = pd.get_dummies(df_av,prefix=df_colname,columns=df_av.columns)
        df_list_rate.append(df_dum)
        
    df_all = pd.concat(df_list_rate,axis =1)
    df_all['mean_rise_fall'] = mrf
    
    df_total = df_all.dropna(axis = 0,how = 'any')
    
    #split rise and fall
    dicu = {}
    dicd = {}
    for index, col in enumerate(df_total.columns[:-1]):
        dicu[col] = col+'_r'
        dicd[col] = col+'_f'

    up = df_total[df_total['mean_rise_fall']==1].rename(columns = dicu).iloc[:,:-1]
    dn = df_total[df_total['mean_rise_fall']==-1].rename(columns = dicd).iloc[:,:-1]
    df_total_ud = pd.concat([up,dn],axis = 1).sort_index().fillna(0)
    
    df_total_ud = pd.concat([df_total_ud,df_rf.loc[df_total_ud.index]],axis = 1).dropna(axis = 0,how = 'any')
    
    #get target rise fall data
    Y = Yunta50.copy()
    Y['trf'] = np.sign((Yunta50-Yunta50.shift(1))['close'].values)
    Y = Y.shift(-1*day+1)
    trf = Y.loc[df_total_ud.index]['trf']

    return [df_total_ud,trf]

In [13]:
def show_summary(summary):
    dic = {}
    df = pd.concat(summary,axis = 0,ignore_index = True)
    for index,i in enumerate(df.index):
        dic[i] = intervals[index//2]
    df_rn = df.rename(index = dic)
    df_rn.index.name = 'interval'
    return df_rn


# All Data with null column

In [338]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]
df_name = ['Yunta50','Taiwan50','orderbook','oil_price','bond','TXF','EXF','FXF','E_F','Nikkei','SP','Russell','DJ','SOX','VIX','pcr','FOI','IOI','DOI']
intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
window = 1
stock_interval = []
for interval in intervals:
    df_total,trf = get_dataframe(df_list,df_option,Yunta50_rate,interval,window)
    stock_interval.append(df_total.as_matrix())

In [339]:
latest_day = 500
pred_day = 50
day = len(intervals)
pip = Model_Selection(models,model_grid_params,stock_interval,latest_day,pred_day,day)

In [206]:
start = time.time()
pip.pipline()
end = time.time()
print ('Total Time = %s'%(end-start))

Day = 1
--------------------Rolling Window Time = 0.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.594
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.608
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.62
Running training & testing for RandomForestClassifier.
Accuracy = 0.68
--------------------Rolling Window Time = 1.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.618
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.64
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.52
Running training & testing fo

{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.646
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.648
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.54
Running training & testing for RandomForestClassifier.
Accuracy = 0.54
--------------------Rolling Window Time = 7.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.656
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.646
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.58
Running training & testing for RandomForestClassifier.
Accuracy = 0.54
--------------------Rolling Window Time = 8.0--------------------
Running Grid

{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.714
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.71
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.76
Running training & testing for RandomForestClassifier.
Accuracy = 0.76
--------------------Rolling Window Time = 3.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.718
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.712
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.74
Running training & testing for RandomForestClassifier.
Accuracy = 0.7
--------------------Rolling Window Time = 4.0--------------------
Running GridSea

{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.764
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.768
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.86
Running training & testing for RandomForestClassifier.
Accuracy = 0.86
--------------------Rolling Window Time = 9.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.776
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.778
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.82
Running training & testing for RandomForestClassifier.
Accuracy = 0.82
Day = 6
--------------------Rolling Window Time = 0.0--------------------
Runn

{'max_depth': None, 'min_samples_leaf': 10}
CV Best Score = 0.936
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.936
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.86
Running training & testing for RandomForestClassifier.
Accuracy = 0.86
--------------------Rolling Window Time = 5.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.922
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.924
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.96
Running training & testing for RandomForestClassifier.
Accuracy = 0.96
--------------------Rolling Window Time = 6.0--------------------
Running Gr

In [337]:
show_summary(pip.summary_day)

Unnamed: 0_level_0,Estimator,Accuracy_mean,Accuracy_std,Accuracy_max,Accuracy_min,F_score
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,RandomForestClassifier,0.614,0.09288,0.74,0.48,0.545992
0.0,DecisionTreeClassifier,0.61,0.092976,0.8,0.48,0.501637
0.1,RandomForestClassifier,0.628,0.087025,0.76,0.48,0.493786
0.1,DecisionTreeClassifier,0.598,0.085088,0.7,0.46,0.466151
0.25,RandomForestClassifier,0.682,0.078003,0.86,0.56,0.480753
0.25,DecisionTreeClassifier,0.68,0.065997,0.82,0.58,0.4791
0.5,RandomForestClassifier,0.732,0.064083,0.84,0.64,0.243056
0.5,DecisionTreeClassifier,0.724,0.066533,0.82,0.62,0.136878
0.75,RandomForestClassifier,0.792,0.0535,0.86,0.7,0.02
0.75,DecisionTreeClassifier,0.79,0.051854,0.86,0.7,0.0


# All data with rf

In [229]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]
df_name = ['Yunta50','Taiwan50','orderbook','oil_price','bond','TXF','EXF','FXF','E_F','Nikkei','SP','Russell','DJ','SOX','VIX','pcr','FOI','IOI','DOI']
intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
window = 1
stock_interval = []
for interval in intervals:
    df_total,trf = get_dataframe_rf(df_list,df_option,Yunta50_rate,interval,window)
    stock_interval.append(df_total.as_matrix())

In [230]:
latest_day = 500
pred_day = 50
day = len(intervals)
pip = Model_Selection(models,model_grid_params,stock_interval,latest_day,pred_day,day)

In [231]:
start = time.time()
pip.pipline()
end = time.time()
print ('Total Time = %s'%(end-start))

Day = 1
--------------------Rolling Window Time = 0.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 5, 'min_samples_leaf': 10}
CV Best Score = 0.578
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.594
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.5
Running training & testing for RandomForestClassifier.
Accuracy = 0.64
--------------------Rolling Window Time = 1.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 5, 'min_samples_leaf': 2}
CV Best Score = 0.562
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.598
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.54
Running training & testing 

{'max_depth': None, 'min_samples_leaf': 5}
CV Best Score = 0.596
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.584
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.6
Running training & testing for RandomForestClassifier.
Accuracy = 0.68
--------------------Rolling Window Time = 7.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.602
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.608
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.56
Running training & testing for RandomForestClassifier.
Accuracy = 0.58
--------------------Rolling Window Time = 8.0--------------------
Running G

{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.708
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.7
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.82
Running training & testing for RandomForestClassifier.
Accuracy = 0.82
--------------------Rolling Window Time = 3.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.726
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.732
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.66
Running training & testing for RandomForestClassifier.
Accuracy = 0.7
--------------------Rolling Window Time = 4.0--------------------
Running GridSea

{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.782
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.782
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.82
Running training & testing for RandomForestClassifier.
Accuracy = 0.82
--------------------Rolling Window Time = 9.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.766
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.788
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.76
Running training & testing for RandomForestClassifier.
Accuracy = 0.76
Day = 6
--------------------Rolling Window Time = 0.0--------------------
Runni

{'max_depth': None, 'min_samples_leaf': 10}
CV Best Score = 0.924
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.924
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.94
Running training & testing for RandomForestClassifier.
Accuracy = 0.94
--------------------Rolling Window Time = 5.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': None, 'min_samples_leaf': 10}
CV Best Score = 0.922
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.898
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.9
Running training & testing for RandomForestClassifier.
Accuracy = 0.96
--------------------Rolling Window Time = 6.0--------------------
Runni

In [232]:
show_summary(pip.summary_day)

Unnamed: 0_level_0,Estimator,Accuracy_mean,Accuracy_std,Accuracy_max,Accuracy_min,F_score
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,RandomForestClassifier,0.566,0.057388,0.66,0.5,0.481818
0.0,DecisionTreeClassifier,0.536,0.032387,0.58,0.48,0.504065
0.1,RandomForestClassifier,0.588,0.073756,0.7,0.48,0.437619
0.1,DecisionTreeClassifier,0.554,0.081131,0.72,0.46,0.325903
0.25,RandomForestClassifier,0.658,0.063561,0.8,0.58,0.377855
0.25,DecisionTreeClassifier,0.636,0.071056,0.8,0.54,0.412996
0.5,RandomForestClassifier,0.73,0.041366,0.82,0.66,0.129324
0.5,DecisionTreeClassifier,0.71,0.054365,0.82,0.64,0.192814
0.75,DecisionTreeClassifier,0.784,0.04402,0.86,0.7,0.0
0.75,RandomForestClassifier,0.784,0.04402,0.86,0.7,0.0


# All data with drop null

In [212]:
df_list = [Yunta50,Taiwan50,orderbook,oil_price,bond,TXF,EXF,FXF,E_F,Nikkei,SP,Russell,DJ,SOX,VIX,pcr,FOI,IOI,DOI]
df_option = [True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,False,True,True,True]
df_name = ['Yunta50','Taiwan50','orderbook','oil_price','bond','TXF','EXF','FXF','E_F','Nikkei','SP','Russell','DJ','SOX','VIX','pcr','FOI','IOI','DOI']
intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
window = 1
stock_interval = []
for interval in intervals:
    df_total,trf = get_dataframe_drop(df_list,df_option,Yunta50_rate,interval,window)
    stock_interval.append(df_total.as_matrix())

In [213]:
latest_day = 500
pred_day = 50
day = len(intervals)
pip = Model_Selection(models,model_grid_params,stock_interval,latest_day,pred_day,day)

In [214]:
start = time.time()
pip.pipline()
end = time.time()
print ('Total Time = %s'%(end-start))

Day = 1
--------------------Rolling Window Time = 0.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.6
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.636
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.56
Running training & testing for RandomForestClassifier.
Accuracy = 0.64
--------------------Rolling Window Time = 1.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.606
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.612
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.54
Running training & testing fo

{'max_depth': None, 'min_samples_leaf': 5}
CV Best Score = 0.604
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.59
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.52
Running training & testing for RandomForestClassifier.
Accuracy = 0.62
--------------------Rolling Window Time = 7.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': None, 'min_samples_leaf': 10}
CV Best Score = 0.606
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.63
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.58
Running training & testing for RandomForestClassifier.
Accuracy = 0.56
--------------------Rolling Window Time = 8.0--------------------
Running 

{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.724
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.71
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.82
Running training & testing for RandomForestClassifier.
Accuracy = 0.84
--------------------Rolling Window Time = 3.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.73
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.736
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.62
Running training & testing for RandomForestClassifier.
Accuracy = 0.6
--------------------Rolling Window Time = 4.0--------------------
Running GridSea

{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.782
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.774
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.82
Running training & testing for RandomForestClassifier.
Accuracy = 0.82
--------------------Rolling Window Time = 9.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.788
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.786
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.76
Running training & testing for RandomForestClassifier.
Accuracy = 0.78
Day = 6
--------------------Rolling Window Time = 0.0--------------------
Runn

{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.924
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.924
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.94
Running training & testing for RandomForestClassifier.
Accuracy = 0.94
--------------------Rolling Window Time = 5.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': None, 'min_samples_leaf': 5}
CV Best Score = 0.844
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.912
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.88
Running training & testing for RandomForestClassifier.
Accuracy = 0.96
--------------------Rolling Window Time = 6.0--------------------
Running Gr

In [215]:
df_sum = show_summary(pip.summary_day)
df_sum

Unnamed: 0_level_0,Estimator,Accuracy_mean,Accuracy_std,Accuracy_max,Accuracy_min,F_score
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,RandomForestClassifier,0.598,0.057697,0.68,0.52,0.52562
0.0,DecisionTreeClassifier,0.568,0.044422,0.68,0.54,0.492163
0.1,DecisionTreeClassifier,0.614,0.07306,0.78,0.52,0.464015
0.1,RandomForestClassifier,0.604,0.051467,0.7,0.54,0.464763
0.25,RandomForestClassifier,0.682,0.075689,0.86,0.62,0.478405
0.25,DecisionTreeClassifier,0.656,0.056411,0.78,0.58,0.431412
0.5,RandomForestClassifier,0.728,0.074952,0.84,0.6,0.192277
0.5,DecisionTreeClassifier,0.704,0.054813,0.82,0.62,0.048571
0.75,RandomForestClassifier,0.79,0.046428,0.86,0.7,0.0669
0.75,DecisionTreeClassifier,0.78,0.044222,0.86,0.7,0.025


##### rf data is little worse than normal and drop null data,and the other two almost the same

## Predict by American data

In [233]:
df_list = [SP,Russell,DJ,SOX,VIX]
df_option = [True,True,True,True,True]
df_name = ['SP','Russell','DJ','SOX','VIX']
intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
window = 1
stock_interval = []
for interval in intervals:
    df_total,trf = get_dataframe_drop(df_list,df_option,Yunta50_rate,interval,window)
    stock_interval.append(df_total.as_matrix())

In [234]:
latest_day = 500
pred_day = 50
day = len(intervals)
pip = Model_Selection(models,model_grid_params,stock_interval,latest_day,pred_day,day)

In [235]:
start = time.time()
pip.pipline()
end = time.time()
print ('Total Time = %s'%(end-start))

Day = 1
--------------------Rolling Window Time = 0.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.594
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.62
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.64
Running training & testing for RandomForestClassifier.
Accuracy = 0.64
--------------------Rolling Window Time = 1.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.604
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.662
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.6
Running training & testing for

{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.638
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.58
Running training & testing for RandomForestClassifier.
Accuracy = 0.58
--------------------Rolling Window Time = 7.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 5, 'min_samples_leaf': 2}
CV Best Score = 0.608
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.636
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.68
Running training & testing for RandomForestClassifier.
Accuracy = 0.72
--------------------Rolling Window Time = 8.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.614
Running Grid

{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.732
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.66
Running training & testing for RandomForestClassifier.
Accuracy = 0.68
--------------------Rolling Window Time = 3.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.718
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.722
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.82
Running training & testing for RandomForestClassifier.
Accuracy = 0.82
--------------------Rolling Window Time = 4.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.718
Running GridS

{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.776
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.8
Running training & testing for RandomForestClassifier.
Accuracy = 0.8
--------------------Rolling Window Time = 9.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.782
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.776
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.82
Running training & testing for RandomForestClassifier.
Accuracy = 0.84
Day = 6
--------------------Rolling Window Time = 0.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.876
Running

{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.936
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.84
Running training & testing for RandomForestClassifier.
Accuracy = 0.84
--------------------Rolling Window Time = 5.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.922
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.922
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.96
Running training & testing for RandomForestClassifier.
Accuracy = 0.96
--------------------Rolling Window Time = 6.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.92
Running GridSea

In [237]:
df_sum_am = show_summary(pip.summary_day)
df_sum_am 

Unnamed: 0_level_0,Estimator,Accuracy_mean,Accuracy_std,Accuracy_max,Accuracy_min,F_score
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,RandomForestClassifier,0.614,0.09288,0.74,0.48,0.545992
0.0,DecisionTreeClassifier,0.61,0.092976,0.8,0.48,0.501637
0.1,RandomForestClassifier,0.628,0.087025,0.76,0.48,0.493786
0.1,DecisionTreeClassifier,0.598,0.085088,0.7,0.46,0.466151
0.25,RandomForestClassifier,0.682,0.078003,0.86,0.56,0.480753
0.25,DecisionTreeClassifier,0.68,0.065997,0.82,0.58,0.4791
0.5,RandomForestClassifier,0.732,0.064083,0.84,0.64,0.243056
0.5,DecisionTreeClassifier,0.724,0.066533,0.82,0.62,0.136878
0.75,RandomForestClassifier,0.792,0.0535,0.86,0.7,0.02
0.75,DecisionTreeClassifier,0.79,0.051854,0.86,0.7,0.0


#  Split the noise data model and the other

In [262]:
df_list = [SP,Russell,DJ,SOX,VIX]
df_option = [True,True,True,True,True]
df_name = ['SP','Russell','DJ','SOX','VIX']
intervals = [0,0.1,0.25,0.5,0.75,1,1.5]
window = 1
stock_interval = []
trf_interval = []
for interval in intervals:
    df_total,trf = get_dataframe_drop(df_list,df_option,Yunta50_rate,interval,window)
    stock_interval.append(df_total.as_matrix())
    trf_interval.append(trf.values)

In [330]:
latest_day = 500
pred_day = 50
day = len(intervals)
pip_rf = Model_Selection_rf(models,model_grid_params,stock_interval,trf_interval,latest_day,pred_day,day)

In [331]:
start = time.time()
pip_rf.pipline()
end = time.time()
print ('Total Time = %s'%(end-start))

Day = 1
--------------------Rolling Window Time = 0.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.594
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.62
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.64
Running training & testing for RandomForestClassifier.
Accuracy = 0.64
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.59
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.628
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.48
Running training & testing for RandomForestClassifier.
Accuracy = 0.58
Accuracy = 0.5
Accuracy = 

{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.61
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.58
Running training & testing for RandomForestClassifier.
Accuracy = 0.56
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.594
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.644
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.54
Running training & testing for RandomForestClassifier.
Accuracy = 0.56
Accuracy = 0.42
Accuracy = 0.48
--------------------Rolling Window Time = 9.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.618
Running GridSearchCV for RandomForestClassifier


{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.604
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.64
Running training & testing for RandomForestClassifier.
Accuracy = 0.54
Accuracy = 0.5
Accuracy = 0.46
--------------------Rolling Window Time = 7.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 5, 'min_samples_leaf': 2}
CV Best Score = 0.608
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.636
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.68
Running training & testing for RandomForestClassifier.
Accuracy = 0.72
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': None, 'min_samples_leaf': 10}
CV Best Score = 0.596
Running GridSearchCV for RandomForestClassi

{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.688
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.6
Running training & testing for RandomForestClassifier.
Accuracy = 0.56
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.634
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.646
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.62
Running training & testing for RandomForestClassifier.
Accuracy = 0.62
Accuracy = 0.5
Accuracy = 0.44
--------------------Rolling Window Time = 6.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.686
Running GridSearchCV for RandomForestClassifier
{

{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.726
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.52
Running training & testing for RandomForestClassifier.
Accuracy = 0.52
Accuracy = 0.44
Accuracy = 0.44
--------------------Rolling Window Time = 4.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.718
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.726
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.62
Running training & testing for RandomForestClassifier.
Accuracy = 0.64
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.72
Running GridSearchCV for RandomForestClassifier


{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.81
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.74
Running training & testing for RandomForestClassifier.
Accuracy = 0.74
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.828
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.828
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.76
Running training & testing for RandomForestClassifier.
Accuracy = 0.76
Accuracy = 0.5
Accuracy = 0.5
--------------------Rolling Window Time = 3.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.798
Running GridSearchCV for RandomForestClassifier
{'c

{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.894
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.8
Running training & testing for RandomForestClassifier.
Accuracy = 0.8
Accuracy = 0.42
Accuracy = 0.36
--------------------Rolling Window Time = 1.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.87
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.87
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.88
Running training & testing for RandomForestClassifier.
Accuracy = 0.88
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.882
Running GridSearchCV for RandomForestClassifier
{'c

{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.836
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.9
Running training & testing for RandomForestClassifier.
Accuracy = 0.9
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.846
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.828
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.92
Running training & testing for RandomForestClassifier.
Accuracy = 0.92
Accuracy = 0.38
Accuracy = 0.44
Day = 7
--------------------Rolling Window Time = 0.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': None, 'min_samples_leaf': 10}
CV Best Score = 0.956
Running GridSearchCV for RandomFores

{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.916
Running training & testing for DecisionTreeClassifier.
Accuracy = 0.94
Running training & testing for RandomForestClassifier.
Accuracy = 0.94
Accuracy = 0.22
Accuracy = 0.46
--------------------Rolling Window Time = 8.0--------------------
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': 1, 'min_samples_leaf': 1}
CV Best Score = 0.912
Running GridSearchCV for RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 5}
CV Best Score = 0.912
Running training & testing for DecisionTreeClassifier.
Accuracy = 1.0
Running training & testing for RandomForestClassifier.
Accuracy = 1.0
Running GridSearchCV for DecisionTreeClassifier
{'max_depth': None, 'min_samples_leaf': 10}
CV Best Score = 0.916
Running GridSearchCV for RandomForestClassifie

In [344]:
pd.concat(pip_rf.summary_day,axis = 0)

Unnamed: 0_level_0,Estimator,Accuracy_mean,Accuracy_std,Accuracy_max,Accuracy_min
Ranking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,RandomForestClassifier,0.478,0.042635,0.56,0.4
0,DecisionTreeClassifier,0.468,0.085479,0.64,0.34
1,RandomForestClassifier,0.474,0.080581,0.6,0.36
0,DecisionTreeClassifier,0.468,0.085479,0.64,0.34
0,DecisionTreeClassifier,0.498,0.064256,0.58,0.4
1,RandomForestClassifier,0.476,0.094657,0.6,0.26
0,DecisionTreeClassifier,0.498,0.064256,0.58,0.4
1,RandomForestClassifier,0.488,0.064773,0.58,0.4
1,RandomForestClassifier,0.492,0.054324,0.58,0.42
0,DecisionTreeClassifier,0.466,0.054201,0.52,0.38


In [351]:
show_summary(pip_rf.summary_day)

Unnamed: 0_level_0,Estimator,Accuracy_mean,Accuracy_std,Accuracy_max,Accuracy_min
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,RandomForestClassifier,0.478,0.042635,0.56,0.4
0.0,DecisionTreeClassifier,0.468,0.085479,0.64,0.34
0.1,RandomForestClassifier,0.474,0.080581,0.6,0.36
0.1,DecisionTreeClassifier,0.468,0.085479,0.64,0.34
0.25,DecisionTreeClassifier,0.498,0.064256,0.58,0.4
0.25,RandomForestClassifier,0.476,0.094657,0.6,0.26
0.5,DecisionTreeClassifier,0.498,0.064256,0.58,0.4
0.5,RandomForestClassifier,0.488,0.064773,0.58,0.4
0.75,RandomForestClassifier,0.492,0.054324,0.58,0.42
0.75,DecisionTreeClassifier,0.466,0.054201,0.52,0.38


In [349]:
show_summary(pip_rf.summary_day_r)

Unnamed: 0_level_0,Estimator,Accuracy_mean,Accuracy_std,Accuracy_max,Accuracy_min,F_score
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,RandomForestClassifier,0.614,0.09288,0.74,0.48,0.545992
0.0,DecisionTreeClassifier,0.61,0.092976,0.8,0.48,0.501637
0.1,RandomForestClassifier,0.628,0.087025,0.76,0.48,0.493786
0.1,DecisionTreeClassifier,0.598,0.085088,0.7,0.46,0.466151
0.25,RandomForestClassifier,0.682,0.078003,0.86,0.56,0.480753
0.25,DecisionTreeClassifier,0.68,0.065997,0.82,0.58,0.4791
0.5,RandomForestClassifier,0.732,0.064083,0.84,0.64,0.243056
0.5,DecisionTreeClassifier,0.724,0.066533,0.82,0.62,0.136878
0.75,RandomForestClassifier,0.792,0.0535,0.86,0.7,0.02
0.75,DecisionTreeClassifier,0.79,0.051854,0.86,0.7,0.0


In [348]:
show_summary(pip_rf.summary_day_f)

Unnamed: 0_level_0,Estimator,Accuracy_mean,Accuracy_std,Accuracy_max,Accuracy_min,F_score
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,RandomForestClassifier,0.6,0.035277,0.64,0.54,0.506394
0.0,DecisionTreeClassifier,0.574,0.06802,0.68,0.48,0.534557
0.1,RandomForestClassifier,0.646,0.067363,0.76,0.54,0.507736
0.1,DecisionTreeClassifier,0.592,0.076129,0.72,0.48,0.463009
0.25,DecisionTreeClassifier,0.68,0.06532,0.78,0.6,0.428503
0.25,RandomForestClassifier,0.648,0.074952,0.76,0.56,0.372888
0.5,DecisionTreeClassifier,0.726,0.112368,0.88,0.52,0.0
0.5,RandomForestClassifier,0.718,0.10174,0.82,0.52,0.044848
0.75,DecisionTreeClassifier,0.786,0.105851,0.9,0.54,0.0
0.75,RandomForestClassifier,0.786,0.105851,0.9,0.54,0.028571
