In [1]:
import talib as ta

from datetime import datetime
import pandas as pd
import numpy as np
import pickle

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

def _is_unset(value):
    """Return True when `value` means "use the notebook's default split".

    Only None and falsy built-ins (0, 0.0, False, [], ()) count as unset.
    Array-likes are deliberately never truth-tested: `not array` raises
    ValueError for NumPy arrays holding more than one element.
    """
    return value is None or (
        isinstance(value, (int, float, list, tuple)) and not value
    )


def get_score(model, x_train=None, x_test=None):
    """Print train and test accuracy (in percent) of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
    x_train : array-like, optional
        Training features to score on. Falls back to the module-level
        ``X_train`` when omitted (None, or the legacy ``0`` sentinel).
    x_test : array-like, optional
        Test features to score on. Falls back to the module-level
        ``X_test`` when omitted (None, or the legacy ``0`` sentinel).

    Notes
    -----
    Labels are always read from the module-level ``y_train`` / ``y_test``,
    so any features passed explicitly must be row-aligned with them.
    Each argument falls back independently, so passing only one of them
    keeps the supplied one and defaults the other.

    Returns
    -------
    None. Three lines are written to stdout.
    """
    if _is_unset(x_train):
        x_train = X_train
    if _is_unset(x_test):
        x_test = X_test

    y_pred = model.predict(x_test)
    print('train: {}'.format(model.score(x_train, y_train) * 100))
    print('test: {}'.format(model.score(x_test, y_test) * 100))
    print('accuracy score: {}'.format(accuracy_score(y_test, y_pred) * 100))

# Hyper-parameter grid for the KNeighborsClassifier grid search:
# 9 x 4 x 2 x 4 = 288 candidate combinations.
params = dict(
    n_neighbors=np.arange(1, 10),   # k = 1 .. 9
    leaf_size=np.arange(1, 5),      # 1 .. 4
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)
# `best_params` is taken from `knn_cv.best_params_` after the grid search has been fitted.

In [2]:
# Load the dataset, parsing the 'Datetime' column as timestamps and using it as the row index.
df = pd.read_csv(
    'data/data3_H1.csv',
    index_col='Datetime',
    parse_dates=['Datetime'],
)

In [3]:
# Labels come from the 'Target' column; every other column is a feature.
y = df['Target'].values
X = df.drop(columns=['Target']).values

# 60 % of the rows go to training, the remaining 40 % to a temporary hold-out.
# NOTE(review): rows are indexed by 'Datetime' and train_test_split shuffles by
# default, so past and future rows are mixed across the splits - confirm that
# this is intended.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, random_state=42, test_size=0.40
)

# Cut the hold-out in two: first half -> test set, remainder -> final set.
half_split = len(X_tmp) // 2
X_test, X_final = X_tmp[:half_split], X_tmp[half_split:]
y_test, y_final = y_tmp[:half_split], y_tmp[half_split:]

In [4]:
# Fit the scaler on the training rows only, then apply that same
# transformation to the train, test and final splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test, X_final = (
    scaler.transform(split) for split in (X_train, X_test, X_final)
)

In [5]:
# Baseline: k-NN with the estimator's default hyper-parameters.
knn = KNeighborsClassifier().fit(X_train, y_train)
get_score(knn)

train: 69.02255380003102
test: 50.04393446012302
accuracy score: 50.04393446012302


In [None]:
# Exhaustive 3-fold cross-validated search over every combination in `params`.
knn_cv = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    verbose=True,
    cv=3,
)
knn_cv.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print(
    "Tuned KNN Parameters: {}".format(knn_cv.best_params_),
    "Best score is {}".format(knn_cv.best_score_),
    sep="\n",
)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# Keep the winning hyper-parameter combination so later cells can build
# KNeighborsClassifier(**best_params) without touching the search object.
best_params = knn_cv.best_params_

In [None]:
# Refit k-NN with the hyper-parameters selected by the grid search and report its scores.
knn = KNeighborsClassifier(**best_params).fit(X_train, y_train)
get_score(knn)

In [None]:
# Seeded generator for the weighted resampling below, so reruns are repeatable.
_BOOST_RNG = np.random.RandomState(42)


class _ResampledKNN(KNeighborsClassifier):
    """k-NN whose ``fit`` accepts ``sample_weight`` so AdaBoost can use it.

    AdaBoostClassifier rejects base estimators whose ``fit`` has no
    ``sample_weight`` parameter, and plain KNeighborsClassifier has none.
    The weights are emulated here by drawing a bootstrap sample of the
    training rows with probability proportional to their weight and fitting
    the ordinary k-NN on that sample.
    """

    def fit(self, X, y, sample_weight=None):
        """Fit on (X, y); when weights are given, fit on a weighted resample."""
        if sample_weight is None:
            return super().fit(X, y)
        weights = np.asarray(sample_weight, dtype=float)
        picked = _BOOST_RNG.choice(
            len(weights),
            size=len(weights),
            replace=True,
            p=weights / weights.sum(),
        )
        return super().fit(X[picked], y[picked])


# Boosted ensemble of tuned k-NN models.
# NOTE(review): recent scikit-learn releases deprecate algorithm="SAMME.R";
# confirm against the installed version before upgrading.
ada_clf = AdaBoostClassifier(
    _ResampledKNN(**best_params),
    n_estimators=1000,
    algorithm="SAMME.R",
    learning_rate=0.01,
)
ada_clf.fit(X_train, y_train)
get_score(ada_clf)

In [None]:
# Bagged ensemble: 1000 tuned k-NN models, each fitted on 500 rows
# drawn with replacement from the training set.
bag_log = BaggingClassifier(
    KNeighborsClassifier(**best_params),
    bootstrap=True,
    max_samples=500,
    n_estimators=1000,
).fit(X_train, y_train)
get_score(bag_log)