In [1]:
import talib as ta

from datetime import datetime
import pandas as pd
import numpy as np
import pickle

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

def get_score(model, x_train=None, x_test=None):
    """Print train and test accuracy, in percent, for a classifier.

    Parameters
    ----------
    model : object
        Estimator exposing ``predict(X)`` and ``score(X, y)``.
    x_train, x_test : array-like, optional
        Feature matrices to score on. Both must be supplied to take effect;
        if either one is left unset (``None`` or the legacy ``0``
        placeholder) the notebook-level ``X_train`` / ``X_test`` are used
        for both.

    Notes
    -----
    Labels are always read from the notebook-level ``y_train`` / ``y_test``,
    so explicitly passed matrices must be row-aligned with them.
    Nothing is returned; three lines are printed.
    """
    def _unset(value):
        # Explicit check instead of `not value`: the truth value of a
        # multi-element numpy array is ambiguous and raises ValueError,
        # which made it impossible to actually pass feature matrices.
        return value is None or (isinstance(value, (int, float)) and not value)

    if _unset(x_train) or _unset(x_test):
        x_train = X_train
        x_test = X_test

    y_pred = model.predict(x_test)
    print('train: {}'.format(model.score(x_train, y_train) * 100))
    print('test: {}'.format(model.score(x_test, y_test) * 100))
    print('accuracy score: {}'.format(accuracy_score(y_test, y_pred) * 100))

# Hyper-parameter grid for an L1-penalised logistic regression.
# NOTE(review): the 'log__' prefix looks like the step-name prefix for a
# Pipeline step called 'log' -- no such pipeline is visible here; confirm.
params_l1 = dict(
    log__penalty=['l1'],
    log__C=np.logspace(-4, 4, 20),
    log__tol=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    log__class_weight=[None, 'balanced'],
    log__solver=['saga'],
    log__multi_class=['ovr', 'multinomial', 'auto'],
)

# Hyper-parameter grid for an L2-penalised logistic regression, keyed by
# plain estimator argument names (no step prefix).
params_l2 = dict(
    penalty=['l2'],
    C=np.logspace(-4, 4, 20),
    tol=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    class_weight=[None, 'balanced'],
    solver=['newton-cg', 'lbfgs', 'sag'],
    multi_class=['ovr', 'multinomial', 'auto'],
)
# Hard-coded logistic-regression settings; the values equal the
# `log_cv2.best_params_` output recorded further down in this notebook.
best_params = dict(
    C=3792.690190732246,
    class_weight=None,
    multi_class='ovr',
    penalty='l2',
    solver='lbfgs',
    tol=1e-05,
)

In [2]:
# Load the dataset; the 'Datetime' column is parsed as timestamps and
# becomes the frame's index.
df = pd.read_csv(
    'data/data3_H1.csv',
    index_col='Datetime',
    parse_dates=['Datetime'],
)

In [3]:
# Labels come from the 'Target' column; every other column is a feature.
y = df['Target'].values
X = df.drop(columns=['Target']).values

# Keep 70 % for training and hold out 30 %, with a fixed seed.
# NOTE(review): the frame is indexed by 'Datetime'; if rows are time-ordered,
# a random split can mix past and future observations -- confirm this is
# intended before trusting the scores.
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.30, random_state=42)

# Cut the hold-out in two: the first half is the test set, the remainder is
# kept aside as the final set.
half_split = len(X_tmp) // 2
X_test, X_final = X_tmp[:half_split], X_tmp[half_split:]
y_test, y_final = y_tmp[:half_split], y_tmp[half_split:]

In [4]:
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaling to the train, test and final splits.
minMaxScaler = MinMaxScaler().fit(X_train)
X_train, X_test, X_final = (
    minMaxScaler.transform(split) for split in (X_train, X_test, X_final)
)

In [5]:
# Train a logistic regression configured with `best_params` on the scaled
# training split; the bare name on the last line displays the fitted model.
log = LogisticRegression(**best_params).fit(X_train, y_train)
log



LogisticRegression(C=3792.690190732246, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
          solver='lbfgs', tol=1e-05, verbose=0, warm_start=False)

In [6]:
# Print train/test accuracy of the plain logistic regression.
get_score(model=log)

train: 52.936659134878084
test: 52.03997243280496
accuracy score: 52.03997243280496


In [11]:
# Exhaustive 3-fold grid search over `params_l2`, using every available core.
# The recorded run below took about 18 minutes for 5400 fits.
log_cv2 = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params_l2,
    cv=3,
    n_jobs=-1,
    verbose=True,
).fit(X_train, y_train)
print('best score: {}'.format(100 * log_cv2.best_score_))

Fitting 3 folds for each of 1800 candidates, totalling 5400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   53.6s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 5400 out of 5400 | elapsed: 18.0min finished


best score: 52.61914255755911




In [13]:
# Show the parameter combination the grid search selected.
log_cv2.best_params_

{'C': 3792.690190732246,
 'class_weight': None,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'solver': 'lbfgs',
 'tol': 1e-05}

In [7]:
# AdaBoost ensemble of 1000 logistic regressions (each configured with
# `best_params`), boosted with a small learning rate.
# The bare name on the last line displays the fitted ensemble.
ada_clf = AdaBoostClassifier(
    LogisticRegression(**best_params),
    learning_rate=0.01,
    n_estimators=1000,
    algorithm="SAMME.R",
).fit(X_train, y_train)
ada_clf



NameError: name 'ads_clf' is not defined

In [8]:
# Print train/test accuracy of the AdaBoost ensemble.
get_score(model=ada_clf)

train: 52.70332137108088
test: 51.977946243969676
accuracy score: 51.977946243969676


In [9]:
# Bagging ensemble: 1000 logistic regressions (each configured with
# `best_params`), every one fitted on a bootstrap sample of 500 training rows.
# `random_state` pins the bootstrap draws so the printed scores are
# reproducible on a fresh Restart & Run All; 42 matches the seed already used
# for `train_test_split` in this notebook.
bag_log = BaggingClassifier(
    LogisticRegression(**best_params),
    n_estimators=1000,
    max_samples=500,
    bootstrap=True,
    random_state=42,
    verbose=True,
    n_jobs=-1
)
bag_log.fit(X_train, y_train)

# Print train/test accuracy of the bagged model.
get_score(bag_log)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:  5.8min remaining: 17.4min
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  5.8min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    1.9s remaining:    5.6s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:   16.0s remaining:   47.9s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:   16.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


train: 52.784546542022944


[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    1.9s remaining:    5.7s


test: 51.96416264645073
accuracy score: 51.96416264645073


[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.4s finished
