In [12]:
import talib as ta

from datetime import datetime
import pandas as pd
import numpy as np
import pickle

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

def get_score(model, x_train=0, x_test=0):
    """Print train score, test score and test accuracy (as percentages).

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict(X)`` and ``score(X, y)``.
    x_train, x_test : array-like, optional
        Feature matrices to evaluate on. If either one is omitted (left at
        the ``0`` default, or passed as ``None``), BOTH fall back to the
        notebook-level ``X_train`` / ``X_test``.

    Notes
    -----
    Labels are always read from the notebook-level ``y_train`` / ``y_test``,
    so any custom matrices must be row-aligned with those label arrays.
    Nothing is returned; the three figures are printed.
    """
    def _omitted(value):
        # Only the scalar sentinel (0) or None counts as "not supplied".
        # Evaluating ``not value`` directly on a multi-element NumPy array
        # raises "truth value of an array is ambiguous", which made the
        # optional arguments unusable with real feature matrices.
        return value is None or (isinstance(value, (int, float)) and not value)

    if _omitted(x_train) or _omitted(x_test):
        x_train = X_train
        x_test = X_test
    y_pred = model.predict(x_test)
    print('train: {}'.format(model.score(x_train, y_train) * 100))
    print('test: {}'.format(model.score(x_test, y_test) * 100))
    print('accuracy score: {}'.format(accuracy_score(y_test, y_pred) * 100))

# Grid for an L1-penalised logistic regression (saga solver).
# NOTE(review): the "log__" prefix presumably addresses a Pipeline step
# named "log" -- no such pipeline is built in the visible cells; confirm.
params_l1 = dict(
    log__penalty=['l1'],
    log__C=np.logspace(-4, 4, 20),
    log__tol=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    log__class_weight=[None, 'balanced'],
    log__solver=['saga'],
    log__multi_class=['ovr', 'multinomial', 'auto'],
)

# Grid for an L2-penalised logistic regression, passed straight to the
# estimator (no step prefix).
params_l2 = dict(
    penalty=['l2'],
    C=np.logspace(-4, 4, 20),
    tol=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    class_weight=[None, 'balanced'],
    solver=['newton-cg', 'lbfgs', 'sag'],
    multi_class=['ovr', 'multinomial', 'auto'],
)

# Hard-coded copy of the winning L2 configuration, so the ensemble cells can
# be run without repeating the grid search.
best_params = dict(
    C=1438.44988828766,
    class_weight='balanced',
    multi_class='multinomial',
    penalty='l2',
    solver='sag',
    tol=0.1,
)


In [2]:
# Load the dataset, parsing 'Datetime' as timestamps and using it as the index.
df = pd.read_csv(
    'data/data3_H1.csv',
    index_col='Datetime',
    parse_dates=['Datetime'],
)

In [3]:
# Separate the feature columns from the 'Target' label column.
X = df.drop('Target', axis=1).values
y = df.Target.values

# Deliberately NO scaling at this point: fitting StandardScaler on the full
# dataset before splitting leaks hold-out statistics into training (and the
# data would then be standardised a second time). The scaler is fit on
# X_train only, in the dedicated scaling cell further down.

# 70% train / 30% held out.
# NOTE(review): train_test_split shuffles rows by default; the frame is
# indexed by Datetime, so a chronological split may be more appropriate --
# confirm against how the features/target were built.
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.30, random_state=42)

# Cut the held-out 30% in two: first half for model comparison ("test"),
# second half kept aside as the final evaluation set.
half_split = len(X_tmp) // 2
X_test = X_tmp[:half_split]
X_final = X_tmp[half_split:]
y_test = y_tmp[:half_split]
y_final = y_tmp[half_split:]


In [4]:
# Dimensions of the full feature matrix (rows, columns).
X.shape

(96733, 25)

In [5]:
# Dimensions of the test split (first half of the 30% hold-out).
X_test.shape

(14510, 25)

In [6]:
# Dimensions of the final split (second half of the 30% hold-out).
X_final.shape

(14510, 25)

In [7]:
# Learn the standardisation statistics from the training split only, then
# apply that same fitted transform to all three splits.
# Caution: this rebinds X_train/X_test/X_final, so running the cell twice
# standardises already-standardised data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test, X_final = (
    scaler.transform(split) for split in (X_train, X_test, X_final)
)

In [9]:
# Exhaustive 3-fold search over the L2 grid, using every available core.
log_cv2 = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params_l2,
    cv=3,
    n_jobs=-1,
    verbose=True,
)
log_cv2.fit(X_train, y_train)

# Mean cross-validated score of the winning combination, as a percentage.
print('best score: {}'.format(100 * log_cv2.best_score_))

Fitting 3 folds for each of 1800 candidates, totalling 5400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 5400 out of 5400 | elapsed: 11.3min finished


best score: 52.67526176657362


In [10]:
# GridSearchCV (default refit=True) has already refit the winning
# configuration on all of X_train, so best_estimator_ is ready to use.
# Fitting it again would only swap that model for a fresh, unseeded fit
# (the grid includes the stochastic 'sag' solver), making the reported
# numbers drift between runs.
log_cv2_best = log_cv2.best_estimator_

# Score the selected estimator itself rather than the search wrapper.
get_score(log_cv2_best)

train: 52.41386439826916
test: 51.8538938662991
accuracy score: 51.8538938662991


In [11]:
# Hyper-parameter combination selected by the grid search.
log_cv2.best_params_

{'C': 1438.44988828766,
 'class_weight': 'balanced',
 'multi_class': 'multinomial',
 'penalty': 'l2',
 'solver': 'sag',
 'tol': 0.1}

In [14]:
# AdaBoost ensemble whose base learner is the tuned logistic regression
# (hyper-parameters taken from best_params).
ada_clf = AdaBoostClassifier(
    LogisticRegression(**best_params),
    algorithm="SAMME.R",
    learning_rate=0.01,
    n_estimators=1000,
)
# Left as the cell's last expression so the fitted estimator's repr is shown.
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=LogisticRegression(C=1438.44988828766, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=None, penalty='l2',
          random_state=None, solver='sag', tol=0.1, verbose=0,
          warm_start=False),
          learning_rate=0.01, n_estimators=1000, random_state=None)

In [16]:
# Print train/test scores for the AdaBoost ensemble on the notebook-level splits.
get_score(ada_clf)

train: 52.31344055055898
test: 52.0882150241213
accuracy score: 52.0882150241213


In [15]:
# Bagging ensemble built from the tuned logistic regression (best_params):
# 1000 estimators, each fit on a bootstrap sample of 500 training rows.
bag_log = BaggingClassifier(
    LogisticRegression(**best_params),
    bootstrap=True,
    max_samples=500,
    n_estimators=1000,
)
bag_log.fit(X_train, y_train)

# Print train/test scores for the bagged model.
get_score(bag_log)

train: 52.645725340776515
test: 51.784975878704344
accuracy score: 51.784975878704344
