In [13]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt

import numpy as np

from sklearn.preprocessing import StandardScaler

np.random.seed(7)

In [35]:
def validate_model(cls_name, model, X_train, y_train, X_test, y_test, period, pair, lag):
    info_format = '{3:.3f}: [period: {0}](lag: {1}): {2} | {4}'
    
    model.fit(X_train, y_train)
    print(info_format.format(period, lag, cls_name, model.score(X_test, y_test), pair))

def test_cls(period, pair, lag):
    df = pd.read_csv('../datasets/{}/{}.csv'.format(period, pair), index_col=0)
    close_lag_fields = ['close-{}'.format(n+1) for n in range(lag)]
    for n in range(1, lag+1):
        df['close-' + str(n)] = df['close'] - df.shift(n)['close']
    df['up'] = df['close'] < df.shift(-1)['close']
    df.dropna(inplace=True)
    X = df[close_lag_fields].values
    y = df['up'].astype(int).values

    scaler = StandardScaler()

    X = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
    
    model = RandomForestClassifier(n_jobs=4, random_state=0, n_estimators=20)
    validate_model('Random forest', model, X_train, y_train, X_test, y_test, period, pair, lag)
    
    model = SGDClassifier(loss="hinge", penalty="l2")
    validate_model('SGD', model, X_train, y_train, X_test, y_test, period, pair, lag)
    
    model = LogisticRegression()
    validate_model('Logistic regression', model, X_train, y_train, X_test, y_test, period, pair, lag)
    
    model = GradientBoostingClassifier(max_depth=4)
    validate_model('Gradient boosting', model, X_train, y_train, X_test, y_test, period, pair, lag)
    
    model = MLPClassifier()
    validate_model('MLP', model, X_train, y_train, X_test, y_test, period, pair, lag)
    
    model = DecisionTreeClassifier()
    validate_model('Decision tree', model, X_train, y_train, X_test, y_test, period, pair, lag)

    model = KNeighborsClassifier()
    validate_model('KNeighbours', model, X_train, y_train, X_test, y_test, period, pair, lag)

    print()


In [36]:
for period in ['5min', '30min', 'day']:
    for lag in range(2, 13):
        for pair in ['BTC_ETC', 'BTC_LTC']:
            test_cls(period, pair, lag)

0.528: [period: 5min](lag: 2): Random forest | BTC_ETC
0.544: [period: 5min](lag: 2): SGD | BTC_ETC
0.542: [period: 5min](lag: 2): Logistic regression | BTC_ETC
0.563: [period: 5min](lag: 2): Gradient boosting | BTC_ETC
0.563: [period: 5min](lag: 2): MLP | BTC_ETC
0.518: [period: 5min](lag: 2): Decision tree | BTC_ETC
0.533: [period: 5min](lag: 2): KNeighbours | BTC_ETC

0.537: [period: 5min](lag: 2): Random forest | BTC_LTC
0.542: [period: 5min](lag: 2): SGD | BTC_LTC
0.541: [period: 5min](lag: 2): Logistic regression | BTC_LTC
0.575: [period: 5min](lag: 2): Gradient boosting | BTC_LTC
0.570: [period: 5min](lag: 2): MLP | BTC_LTC
0.526: [period: 5min](lag: 2): Decision tree | BTC_LTC
0.539: [period: 5min](lag: 2): KNeighbours | BTC_LTC

0.541: [period: 5min](lag: 3): Random forest | BTC_ETC
0.548: [period: 5min](lag: 3): SGD | BTC_ETC
0.548: [period: 5min](lag: 3): Logistic regression | BTC_ETC
0.569: [period: 5min](lag: 3): Gradient boosting | BTC_ETC
0.567: [period: 5min](lag: 3): M

0.553: [period: 5min](lag: 12): KNeighbours | BTC_LTC

0.500: [period: 30min](lag: 2): Random forest | BTC_ETC
0.477: [period: 30min](lag: 2): SGD | BTC_ETC
0.528: [period: 30min](lag: 2): Logistic regression | BTC_ETC
0.533: [period: 30min](lag: 2): Gradient boosting | BTC_ETC
0.529: [period: 30min](lag: 2): MLP | BTC_ETC
0.498: [period: 30min](lag: 2): Decision tree | BTC_ETC
0.507: [period: 30min](lag: 2): KNeighbours | BTC_ETC

0.534: [period: 30min](lag: 2): Random forest | BTC_LTC
0.530: [period: 30min](lag: 2): SGD | BTC_LTC
0.529: [period: 30min](lag: 2): Logistic regression | BTC_LTC
0.554: [period: 30min](lag: 2): Gradient boosting | BTC_LTC
0.561: [period: 30min](lag: 2): MLP | BTC_LTC
0.514: [period: 30min](lag: 2): Decision tree | BTC_LTC
0.538: [period: 30min](lag: 2): KNeighbours | BTC_LTC

0.524: [period: 30min](lag: 3): Random forest | BTC_ETC
0.536: [period: 30min](lag: 3): SGD | BTC_ETC
0.543: [period: 30min](lag: 3): Logistic regression | BTC_ETC
0.545: [period: 30m

0.580: [period: 30min](lag: 12): Gradient boosting | BTC_LTC
0.582: [period: 30min](lag: 12): MLP | BTC_LTC
0.526: [period: 30min](lag: 12): Decision tree | BTC_LTC
0.550: [period: 30min](lag: 12): KNeighbours | BTC_LTC

0.575: [period: day](lag: 2): Random forest | BTC_ETC
0.658: [period: day](lag: 2): SGD | BTC_ETC
0.644: [period: day](lag: 2): Logistic regression | BTC_ETC
0.575: [period: day](lag: 2): Gradient boosting | BTC_ETC
0.658: [period: day](lag: 2): MLP | BTC_ETC
0.534: [period: day](lag: 2): Decision tree | BTC_ETC
0.479: [period: day](lag: 2): KNeighbours | BTC_ETC

0.534: [period: day](lag: 2): Random forest | BTC_LTC
0.630: [period: day](lag: 2): SGD | BTC_LTC
0.658: [period: day](lag: 2): Logistic regression | BTC_LTC
0.616: [period: day](lag: 2): Gradient boosting | BTC_LTC
0.644: [period: day](lag: 2): MLP | BTC_LTC
0.479: [period: day](lag: 2): Decision tree | BTC_LTC
0.507: [period: day](lag: 2): KNeighbours | BTC_LTC

0.521: [period: day](lag: 3): Random forest |