In [15]:
import numpy as np
import pandas as pd
import sklearn.model_selection
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
%run Data_Engineering.ipynb

In [2]:
# Candidate estimators, keyed by their class name. The keys must match the
# keys of the hyper-parameter grid dictionary that is searched for each model.
models = {
    type(estimator).__name__: estimator
    for estimator in (
        RandomForestClassifier(random_state=0),
        ExtraTreesClassifier(random_state=0),
        # AdaBoost boosts decision trees; the tree's own settings are tuned
        # through the "base_estimator__*" entries of the parameter grid.
        AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(),
            n_estimators=10,
            random_state=0,
        ),
        GradientBoostingClassifier(random_state=0),
        SVC(probability=True, random_state=0),
    )
}

In [3]:
# Hyper-parameter grids, one entry per estimator name. Each inner list holds
# the candidate values for that parameter; the SVC entry is a list of two
# separate grids (one per kernel).
model_params = {
    'RandomForestClassifier': {
        'max_features': [None],
        'n_estimators': [10],
        'max_depth': [10],
        'min_samples_split': [2],
        'criterion': ['entropy'],
        'min_samples_leaf': [3],
    },
    'ExtraTreesClassifier': {
        'max_features': [None],
        'n_estimators': [10],
        'max_depth': [10],
        'min_samples_split': [2],
        'criterion': ['entropy'],
        'min_samples_leaf': [3],
    },
    'AdaBoostClassifier': {
        "base_estimator__criterion": ["entropy"],
        "base_estimator__max_depth": [None],
        "base_estimator__min_samples_leaf": [3],
        "base_estimator__min_samples_split": [2],
        "base_estimator__max_features": [None],
    },
    'GradientBoostingClassifier': {
        'max_features': [None],
        'n_estimators': [10],
        'max_depth': [10],
        'min_samples_split': [2],
        'min_samples_leaf': [3],
        'learning_rate': [0.1],
        'subsample': [1.0],
    },
    'SVC': [
        {'kernel': ['rbf'], 'gamma': [1e-1], 'C': [1]},
        {'kernel': ['linear'], 'C': [1, 10]},
    ],
}

In [8]:
class Model_Selection:
    """Rolling-window grid search and evaluation of several classifiers.

    For each window, every estimator in ``models`` is tuned with
    ``GridSearchCV`` on the training slice, refitted with the best
    parameters, and scored (accuracy and F1) on the following test slice.

    Parameters
    ----------
    models : dict
        Maps an estimator name to an estimator instance. The instances are
        reconfigured in place (``set_params``) and refitted on every window.
    model_params : dict
        Maps the same names to the parameter grid passed to ``GridSearchCV``.
    data : DataFrame-like
        Row-sliceable table whose column labelled ``0`` is the target; all
        remaining columns are used as features.
    latest_sec : int
        Number of rows in each training window.
    pred_sec : int
        Number of rows in each test window; also the step between windows.
    """

    def __init__(self, models, model_params, data, latest_sec, pred_sec):
        self.models = models
        self.model_params = model_params
        self.latest_sec = latest_sec
        self.pred_sec = pred_sec
        self.data = data
        self.keys = models.keys()
        # Fitted GridSearchCV object of the most recent window, per model name.
        self.Grid = {}
        # Running per-model results: one entry is appended per window.
        self.predict_values = {}
        self.cv_acc = {}
        self.acc = {}
        self.fscore = {}
        self.true_values = {}
        # Per-model history: one snapshot of the running results per window.
        self.cv_acc_day = {}
        self.acc_day = {}
        self.fscore_day = {}
        self.true_values_day = {}
        self.predict_values_day = {}
        # One score_summary() table per window.
        self.summary_day = []

    def set_list(self):
        """Reset the running per-model result lists to empty lists."""
        for key in self.keys:
            self.predict_values[key] = []
            self.cv_acc[key] = []
            self.acc[key] = []
            self.fscore[key] = []
            self.true_values[key] = []

    def set_list_day(self):
        """Reset the per-window snapshot lists to empty lists."""
        for key in self.keys:
            self.predict_values_day[key] = []
            self.cv_acc_day[key] = []
            self.acc_day[key] = []
            self.fscore_day[key] = []
            self.true_values_day[key] = []

    def grid_fit(self, X_train, y_train, cv = 5, scoring = 'accuracy'):
        """Run a grid search for every model and record its best CV score.

        The fitted search object is stored in ``self.Grid[name]`` and the best
        cross-validation score is appended to ``self.cv_acc[name]``.
        ``set_list()`` must have been called first so those lists exist.
        """
        for key in self.keys:
            print(f'Running grid search for {key}.')
            search = GridSearchCV(self.models[key],
                                  self.model_params[key],
                                  scoring = scoring,
                                  cv = cv)
            search.fit(X_train, y_train)
            self.Grid[key] = search
            print(search.best_params_)
            print('CV Best Score = %s'%(search.best_score_))
            self.cv_acc[key].append(search.best_score_)

    def model_fit(self, X_train, X_test, y_train, y_test):
        """Refit every model with its best parameters and score it on the test slice.

        Requires ``grid_fit`` to have populated ``self.Grid``. For each model
        the predictions, true labels, accuracy and F1 score are appended to the
        running result lists, and the top five feature weights are printed.
        """
        for key in self.keys:
            model = self.models[key]
            model.set_params(**self.Grid[key].best_params_)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            self.predict_values[key].append(predictions.tolist())
            self.true_values[key].append(y_test.tolist())
            self.acc[key].append(metrics.accuracy_score(y_test, predictions))
            self.fscore[key].append(metrics.f1_score(y_test, predictions))
            # Reported inside the loop so that every model is covered, not
            # only whichever key happens to be iterated last.
            self._report_feature_importance(key, model)

    def _report_feature_importance(self, key, model):
        """Print the five largest feature weights of a fitted model."""
        if key == 'SVC':
            if self.Grid[key].best_params_['kernel'] != 'linear':
                # Only the linear kernel exposes per-feature coefficients.
                print('Kernel is rbf')
                return
            weights = model.coef_[0]
            template = 'Kernel is linear and top five importance features = %s'
        else:
            weights = model.feature_importances_
            template = 'Top five importance features = %s'
        # Pair each weight with its positional feature index; the number of
        # features is taken from the model instead of being hard-coded.
        ranked = sorted(enumerate(weights), key = lambda pair: pair[1], reverse = True)
        print(template%(ranked[:5],))

    def pipeline(self, total_rows = 3232):
        """Run grid search and evaluation over consecutive rolling windows.

        Parameters
        ----------
        total_rows : int, optional
            Upper bound used to compute the window start positions. Windows
            start at ``0, pred_sec, 2*pred_sec, ...`` while the start is below
            ``total_rows - latest_sec - pred_sec``. Defaults to 3232.

        All result containers, including ``summary_day``, are reset at the
        start, so calling this method again does not accumulate stale results.
        """
        self.set_list()
        self.set_list_day()
        self.summary_day = []
        last_start = total_rows - self.latest_sec - self.pred_sec
        for i in range(0, last_start, self.pred_sec):
            print('--------------------Rolling Window Time = %s--------------------'%(i/self.pred_sec))
            train_end = i + self.latest_sec
            data_train = self.data[i:train_end]
            X_train = data_train.drop(0, axis = 1)
            y_train = data_train[0]

            data_test = self.data[train_end:train_end + self.pred_sec]
            X_test = data_test.drop(0, axis = 1)
            y_test = data_test[0]

            self.grid_fit(X_train, y_train, cv = 5, scoring = 'accuracy')
            self.model_fit(X_train, X_test, y_train, y_test)

            for key in self.keys:
                # Store copies: the running lists keep growing, and appending
                # the list objects themselves would make every snapshot alias
                # the same (final) list.
                self.cv_acc_day[key].append(list(self.cv_acc[key]))
                self.acc_day[key].append(list(self.acc[key]))
                self.fscore_day[key].append(list(self.fscore[key]))
                self.true_values_day[key].append(list(self.true_values[key]))
                self.predict_values_day[key].append(list(self.predict_values[key]))

            self.summary_day.append(self.score_summary(sort_by = 'Accuracy_mean'))

    def score_summary(self,sort_by):
        """Return a table of accuracy statistics and mean F1 per model.

        Parameters
        ----------
        sort_by : str
            Column to sort by, in descending order.

        Returns
        -------
        pandas.DataFrame
            Columns ``Estimator``, ``Accuracy_mean``, ``Accuracy_std``,
            ``Accuracy_max``, ``Accuracy_min`` and ``F_score``; the index is
            named ``Ranking``.
        """
        columns = ['Estimator','Accuracy_mean','Accuracy_std','Accuracy_max','Accuracy_min','F_score']
        rows = [
            {
                'Estimator': key,
                'Accuracy_mean': np.mean(scores),
                'Accuracy_std': np.std(scores),
                'Accuracy_max': np.max(scores),
                'Accuracy_min': np.min(scores),
                'F_score': np.mean(self.fscore[key]),
            }
            for key, scores in self.acc.items()
        ]
        summary = pd.DataFrame(rows, columns = columns)
        summary.index.name = 'Ranking'
        return summary.sort_values(by = [sort_by], ascending=False)

    def print_(self):
        """Print the running predictions of every model."""
        print(self.predict_values)

In [13]:
# Number of rows in each training window (passed to Model_Selection as latest_sec).
# The "_sec" suffix suggests one row per second — TODO confirm against the data source.
latest_sec = 60 * 30
# Number of rows in each test window; the pipeline also uses it as the step between windows.
pred_sec = 200
# setup() is not defined in this notebook; presumably it is provided by the
# `%run Data_Engineering.ipynb` line in the import cell — verify.
processed_data = setup()
# NOTE(review): the variable name `pip` is easy to confuse with the package installer.
pip = Model_Selection(models,model_params,processed_data,latest_sec,pred_sec)

In [14]:
# Run grid search and evaluation for every model on every rolling window (prints progress).
pip.pipeline()
# Show the first table appended to summary_day, i.e. the scores after the first window,
# with the index renumbered from 0.
pip.summary_day[0].reset_index(drop = True)

--------------------Rolling Window Time = 0.0--------------------
Running grid search for RandomForestClassifier.
{'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.6838888888888889
Running grid search for ExtraTreesClassifier.
{'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
CV Best Score = 0.6966666666666667
Running grid search for AdaBoostClassifier.
{'base_estimator__criterion': 'entropy', 'base_estimator__max_depth': None, 'base_estimator__max_features': None, 'base_estimator__min_samples_leaf': 3, 'base_estimator__min_samples_split': 2}
CV Best Score = 0.7022222222222222
Running grid search for GradientBoostingClassifier.
{'learning_rate': 0.1, 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10, 'subsample': 1.0}
CV Best Score = 0.711111111111111
Run

Unnamed: 0,Estimator,Accuracy_mean,Accuracy_std,Accuracy_max,Accuracy_min,F_score
0,ExtraTreesClassifier,0.63,0.0,0.63,0.63,0.773006
1,AdaBoostClassifier,0.62,0.0,0.62,0.62,0.763975
2,SVC,0.62,0.0,0.62,0.62,0.765432
3,RandomForestClassifier,0.615,0.0,0.615,0.615,0.76161
4,GradientBoostingClassifier,0.585,0.0,0.585,0.585,0.736508
