In [5]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt

import numpy as np

from sklearn.preprocessing import StandardScaler

np.random.seed(7)

In [8]:
def validate_model(cls_name, model, X_train, y_train, X_test, y_test, period, pair, lag):
    info_format = '{3:.3f}: [period: {0}](lag: {1}): {2} | {4}'
    
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(info_format.format(period, lag, cls_name, score, pair))
    return score

def test_cls(period, pair, lag):
    df = pd.read_csv('../datasets/{}/{}.csv'.format(period, pair), index_col=0)
    close_lag_fields = ['close-{}'.format(n+1) for n in range(lag)]
    volume_lag_fields = ['volume-{}'.format(n+1) for n in range(lag)]
    for n in range(1, lag+1):
        df['close-' + str(n)] = df['close'] - df.shift(n)['close']
        df['volume-' + str(n)] = df['volume'] - df.shift(n)['volume']
    df['up'] = df['close'] < df.shift(-1)['close']
    df.dropna(inplace=True)
    X = df[close_lag_fields + volume_lag_fields].values
    y = df['up'].astype(int).values

    scaler = StandardScaler()

    X = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
    
    best_score = 0.0
    
    model = RandomForestClassifier(n_jobs=4, random_state=0, n_estimators=20)
    score = validate_model('Random forest', model, X_train, y_train, X_test, y_test, period, pair, lag)
    if score > best_score:
        best_score = score
    
    model = SGDClassifier(loss="hinge", penalty="l2")
    score = validate_model('SGD', model, X_train, y_train, X_test, y_test, period, pair, lag)
    if score > best_score:
        best_score = score
    
    model = LogisticRegression()
    score = validate_model('Logistic regression', model, X_train, y_train, X_test, y_test, period, pair, lag)
    if score > best_score:
        best_score = score
    
    model = GradientBoostingClassifier(max_depth=4)
    score = validate_model('Gradient boosting', model, X_train, y_train, X_test, y_test, period, pair, lag)
    if score > best_score:
        best_score = score
    
    model = MLPClassifier()
    score = validate_model('MLP', model, X_train, y_train, X_test, y_test, period, pair, lag)
    if score > best_score:
        best_score = score
    
    model = DecisionTreeClassifier()
    score = validate_model('Decision tree', model, X_train, y_train, X_test, y_test, period, pair, lag)
    if score > best_score:
        best_score = score

    model = KNeighborsClassifier()
    score = validate_model('KNeighbours', model, X_train, y_train, X_test, y_test, period, pair, lag)
    if score > best_score:
        best_score = score

    print()
    return best_score


In [9]:
best_score = 0.0

for period in ['5min', '30min', 'day']:
    for lag in range(2, 37):
        for pair in ['BTC_ETC', 'BTC_LTC']:
            score = test_cls(period, pair, lag)
            if score > best_score:
                best_score = score
            print('Best score: {:.3f}'.format(best_score))
            print()

0.542: [period: 5min](lag: 2): Random forest | BTC_ETC
0.547: [period: 5min](lag: 2): SGD | BTC_ETC
0.548: [period: 5min](lag: 2): Logistic regression | BTC_ETC
0.571: [period: 5min](lag: 2): Gradient boosting | BTC_ETC
0.568: [period: 5min](lag: 2): MLP | BTC_ETC
0.514: [period: 5min](lag: 2): Decision tree | BTC_ETC
0.536: [period: 5min](lag: 2): KNeighbours | BTC_ETC

Best score: 0.571

0.555: [period: 5min](lag: 2): Random forest | BTC_LTC
0.535: [period: 5min](lag: 2): SGD | BTC_LTC
0.536: [period: 5min](lag: 2): Logistic regression | BTC_LTC
0.577: [period: 5min](lag: 2): Gradient boosting | BTC_LTC
0.579: [period: 5min](lag: 2): MLP | BTC_LTC
0.524: [period: 5min](lag: 2): Decision tree | BTC_LTC
0.547: [period: 5min](lag: 2): KNeighbours | BTC_LTC

Best score: 0.579

0.545: [period: 5min](lag: 3): Random forest | BTC_ETC
0.542: [period: 5min](lag: 3): SGD | BTC_ETC
0.544: [period: 5min](lag: 3): Logistic regression | BTC_ETC
0.572: [period: 5min](lag: 3): Gradient boosting | BT

0.538: [period: 5min](lag: 12): KNeighbours | BTC_ETC

Best score: 0.589

0.563: [period: 5min](lag: 12): Random forest | BTC_LTC
0.537: [period: 5min](lag: 12): SGD | BTC_LTC
0.536: [period: 5min](lag: 12): Logistic regression | BTC_LTC
0.584: [period: 5min](lag: 12): Gradient boosting | BTC_LTC
0.576: [period: 5min](lag: 12): MLP | BTC_LTC
0.530: [period: 5min](lag: 12): Decision tree | BTC_LTC
0.551: [period: 5min](lag: 12): KNeighbours | BTC_LTC

Best score: 0.589

0.558: [period: 5min](lag: 13): Random forest | BTC_ETC
0.556: [period: 5min](lag: 13): SGD | BTC_ETC
0.552: [period: 5min](lag: 13): Logistic regression | BTC_ETC
0.578: [period: 5min](lag: 13): Gradient boosting | BTC_ETC
0.572: [period: 5min](lag: 13): MLP | BTC_ETC
0.517: [period: 5min](lag: 13): Decision tree | BTC_ETC
0.532: [period: 5min](lag: 13): KNeighbours | BTC_ETC

Best score: 0.589

0.569: [period: 5min](lag: 13): Random forest | BTC_LTC
0.520: [period: 5min](lag: 13): SGD | BTC_LTC
0.541: [period: 5min](la

0.588: [period: 5min](lag: 22): Gradient boosting | BTC_LTC
0.581: [period: 5min](lag: 22): MLP | BTC_LTC
0.532: [period: 5min](lag: 22): Decision tree | BTC_LTC
0.556: [period: 5min](lag: 22): KNeighbours | BTC_LTC

Best score: 0.593

0.553: [period: 5min](lag: 23): Random forest | BTC_ETC
0.535: [period: 5min](lag: 23): SGD | BTC_ETC
0.550: [period: 5min](lag: 23): Logistic regression | BTC_ETC
0.580: [period: 5min](lag: 23): Gradient boosting | BTC_ETC
0.564: [period: 5min](lag: 23): MLP | BTC_ETC
0.518: [period: 5min](lag: 23): Decision tree | BTC_ETC
0.539: [period: 5min](lag: 23): KNeighbours | BTC_ETC

Best score: 0.593

0.568: [period: 5min](lag: 23): Random forest | BTC_LTC
0.545: [period: 5min](lag: 23): SGD | BTC_LTC
0.538: [period: 5min](lag: 23): Logistic regression | BTC_LTC
0.590: [period: 5min](lag: 23): Gradient boosting | BTC_LTC
0.582: [period: 5min](lag: 23): MLP | BTC_LTC
0.528: [period: 5min](lag: 23): Decision tree | BTC_LTC
0.561: [period: 5min](lag: 23): KNeigh

0.550: [period: 5min](lag: 33): Random forest | BTC_ETC
0.530: [period: 5min](lag: 33): SGD | BTC_ETC
0.546: [period: 5min](lag: 33): Logistic regression | BTC_ETC
0.573: [period: 5min](lag: 33): Gradient boosting | BTC_ETC
0.560: [period: 5min](lag: 33): MLP | BTC_ETC
0.522: [period: 5min](lag: 33): Decision tree | BTC_ETC
0.534: [period: 5min](lag: 33): KNeighbours | BTC_ETC

Best score: 0.593

0.574: [period: 5min](lag: 33): Random forest | BTC_LTC
0.537: [period: 5min](lag: 33): SGD | BTC_LTC
0.544: [period: 5min](lag: 33): Logistic regression | BTC_LTC
0.591: [period: 5min](lag: 33): Gradient boosting | BTC_LTC
0.586: [period: 5min](lag: 33): MLP | BTC_LTC
0.533: [period: 5min](lag: 33): Decision tree | BTC_LTC
0.560: [period: 5min](lag: 33): KNeighbours | BTC_LTC

Best score: 0.593

0.551: [period: 5min](lag: 34): Random forest | BTC_ETC
0.512: [period: 5min](lag: 34): SGD | BTC_ETC
0.548: [period: 5min](lag: 34): Logistic regression | BTC_ETC
0.578: [period: 5min](lag: 34): Grad

0.556: [period: 30min](lag: 8): MLP | BTC_ETC
0.510: [period: 30min](lag: 8): Decision tree | BTC_ETC
0.523: [period: 30min](lag: 8): KNeighbours | BTC_ETC

Best score: 0.593

0.535: [period: 30min](lag: 8): Random forest | BTC_LTC
0.528: [period: 30min](lag: 8): SGD | BTC_LTC
0.529: [period: 30min](lag: 8): Logistic regression | BTC_LTC
0.565: [period: 30min](lag: 8): Gradient boosting | BTC_LTC
0.562: [period: 30min](lag: 8): MLP | BTC_LTC
0.524: [period: 30min](lag: 8): Decision tree | BTC_LTC
0.538: [period: 30min](lag: 8): KNeighbours | BTC_LTC

Best score: 0.593

0.528: [period: 30min](lag: 9): Random forest | BTC_ETC
0.526: [period: 30min](lag: 9): SGD | BTC_ETC
0.541: [period: 30min](lag: 9): Logistic regression | BTC_ETC
0.526: [period: 30min](lag: 9): Gradient boosting | BTC_ETC
0.542: [period: 30min](lag: 9): MLP | BTC_ETC
0.510: [period: 30min](lag: 9): Decision tree | BTC_ETC
0.520: [period: 30min](lag: 9): KNeighbours | BTC_ETC

Best score: 0.593

0.528: [period: 30min](l

0.512: [period: 30min](lag: 18): KNeighbours | BTC_ETC

Best score: 0.593

0.554: [period: 30min](lag: 18): Random forest | BTC_LTC
0.516: [period: 30min](lag: 18): SGD | BTC_LTC
0.538: [period: 30min](lag: 18): Logistic regression | BTC_LTC
0.580: [period: 30min](lag: 18): Gradient boosting | BTC_LTC
0.570: [period: 30min](lag: 18): MLP | BTC_LTC
0.536: [period: 30min](lag: 18): Decision tree | BTC_LTC
0.545: [period: 30min](lag: 18): KNeighbours | BTC_LTC

Best score: 0.593

0.530: [period: 30min](lag: 19): Random forest | BTC_ETC
0.518: [period: 30min](lag: 19): SGD | BTC_ETC
0.545: [period: 30min](lag: 19): Logistic regression | BTC_ETC
0.535: [period: 30min](lag: 19): Gradient boosting | BTC_ETC
0.522: [period: 30min](lag: 19): MLP | BTC_ETC
0.502: [period: 30min](lag: 19): Decision tree | BTC_ETC
0.530: [period: 30min](lag: 19): KNeighbours | BTC_ETC

Best score: 0.593

0.563: [period: 30min](lag: 19): Random forest | BTC_LTC
0.532: [period: 30min](lag: 19): SGD | BTC_LTC
0.527: 

0.540: [period: 30min](lag: 28): Random forest | BTC_LTC
0.549: [period: 30min](lag: 28): SGD | BTC_LTC
0.527: [period: 30min](lag: 28): Logistic regression | BTC_LTC
0.570: [period: 30min](lag: 28): Gradient boosting | BTC_LTC
0.556: [period: 30min](lag: 28): MLP | BTC_LTC
0.518: [period: 30min](lag: 28): Decision tree | BTC_LTC
0.524: [period: 30min](lag: 28): KNeighbours | BTC_LTC

Best score: 0.593

0.511: [period: 30min](lag: 29): Random forest | BTC_ETC
0.506: [period: 30min](lag: 29): SGD | BTC_ETC
0.523: [period: 30min](lag: 29): Logistic regression | BTC_ETC
0.535: [period: 30min](lag: 29): Gradient boosting | BTC_ETC
0.521: [period: 30min](lag: 29): MLP | BTC_ETC
0.512: [period: 30min](lag: 29): Decision tree | BTC_ETC
0.513: [period: 30min](lag: 29): KNeighbours | BTC_ETC

Best score: 0.593

0.544: [period: 30min](lag: 29): Random forest | BTC_LTC
0.456: [period: 30min](lag: 29): SGD | BTC_LTC
0.544: [period: 30min](lag: 29): Logistic regression | BTC_LTC
0.568: [period: 30m

0.671: [period: day](lag: 3): MLP | BTC_LTC
0.575: [period: day](lag: 3): Decision tree | BTC_LTC
0.562: [period: day](lag: 3): KNeighbours | BTC_LTC

Best score: 0.671

0.603: [period: day](lag: 4): Random forest | BTC_ETC
0.479: [period: day](lag: 4): SGD | BTC_ETC
0.589: [period: day](lag: 4): Logistic regression | BTC_ETC
0.507: [period: day](lag: 4): Gradient boosting | BTC_ETC
0.562: [period: day](lag: 4): MLP | BTC_ETC
0.411: [period: day](lag: 4): Decision tree | BTC_ETC
0.479: [period: day](lag: 4): KNeighbours | BTC_ETC

Best score: 0.671

0.548: [period: day](lag: 4): Random forest | BTC_LTC
0.575: [period: day](lag: 4): SGD | BTC_LTC
0.562: [period: day](lag: 4): Logistic regression | BTC_LTC
0.548: [period: day](lag: 4): Gradient boosting | BTC_LTC
0.521: [period: day](lag: 4): MLP | BTC_LTC
0.548: [period: day](lag: 4): Decision tree | BTC_LTC
0.616: [period: day](lag: 4): KNeighbours | BTC_LTC

Best score: 0.671

0.514: [period: day](lag: 5): Random forest | BTC_ETC
0.62

0.479: [period: day](lag: 14): Random forest | BTC_LTC
0.507: [period: day](lag: 14): SGD | BTC_LTC
0.535: [period: day](lag: 14): Logistic regression | BTC_LTC
0.451: [period: day](lag: 14): Gradient boosting | BTC_LTC
0.521: [period: day](lag: 14): MLP | BTC_LTC
0.549: [period: day](lag: 14): Decision tree | BTC_LTC
0.549: [period: day](lag: 14): KNeighbours | BTC_LTC

Best score: 0.681

0.586: [period: day](lag: 15): Random forest | BTC_ETC
0.471: [period: day](lag: 15): SGD | BTC_ETC
0.643: [period: day](lag: 15): Logistic regression | BTC_ETC
0.571: [period: day](lag: 15): Gradient boosting | BTC_ETC
0.629: [period: day](lag: 15): MLP | BTC_ETC
0.500: [period: day](lag: 15): Decision tree | BTC_ETC
0.414: [period: day](lag: 15): KNeighbours | BTC_ETC

Best score: 0.681

0.529: [period: day](lag: 15): Random forest | BTC_LTC
0.557: [period: day](lag: 15): SGD | BTC_LTC
0.557: [period: day](lag: 15): Logistic regression | BTC_LTC
0.543: [period: day](lag: 15): Gradient boosting | BT

0.603: [period: day](lag: 25): Random forest | BTC_ETC
0.588: [period: day](lag: 25): SGD | BTC_ETC
0.632: [period: day](lag: 25): Logistic regression | BTC_ETC
0.632: [period: day](lag: 25): Gradient boosting | BTC_ETC
0.559: [period: day](lag: 25): MLP | BTC_ETC
0.441: [period: day](lag: 25): Decision tree | BTC_ETC
0.559: [period: day](lag: 25): KNeighbours | BTC_ETC

Best score: 0.681

0.515: [period: day](lag: 25): Random forest | BTC_LTC
0.382: [period: day](lag: 25): SGD | BTC_LTC
0.559: [period: day](lag: 25): Logistic regression | BTC_LTC
0.559: [period: day](lag: 25): Gradient boosting | BTC_LTC
0.500: [period: day](lag: 25): MLP | BTC_LTC
0.574: [period: day](lag: 25): Decision tree | BTC_LTC
0.515: [period: day](lag: 25): KNeighbours | BTC_LTC

Best score: 0.681

0.588: [period: day](lag: 26): Random forest | BTC_ETC
0.574: [period: day](lag: 26): SGD | BTC_ETC
0.603: [period: day](lag: 26): Logistic regression | BTC_ETC
0.603: [period: day](lag: 26): Gradient boosting | BT

0.424: [period: day](lag: 35): Random forest | BTC_LTC
0.591: [period: day](lag: 35): SGD | BTC_LTC
0.545: [period: day](lag: 35): Logistic regression | BTC_LTC
0.515: [period: day](lag: 35): Gradient boosting | BTC_LTC
0.455: [period: day](lag: 35): MLP | BTC_LTC
0.561: [period: day](lag: 35): Decision tree | BTC_LTC
0.439: [period: day](lag: 35): KNeighbours | BTC_LTC

Best score: 0.681

0.545: [period: day](lag: 36): Random forest | BTC_ETC
0.470: [period: day](lag: 36): SGD | BTC_ETC
0.515: [period: day](lag: 36): Logistic regression | BTC_ETC
0.485: [period: day](lag: 36): Gradient boosting | BTC_ETC
0.470: [period: day](lag: 36): MLP | BTC_ETC
0.515: [period: day](lag: 36): Decision tree | BTC_ETC
0.485: [period: day](lag: 36): KNeighbours | BTC_ETC

Best score: 0.681

0.515: [period: day](lag: 36): Random forest | BTC_LTC
0.500: [period: day](lag: 36): SGD | BTC_LTC
0.621: [period: day](lag: 36): Logistic regression | BTC_LTC
0.470: [period: day](lag: 36): Gradient boosting | BT