In [29]:
import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb

import settings
import utils
import get_data

### Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['coincheckJPY', 'krakenEUR', 'bitstampUSD', 'okcoinCNY', 'btcnCNY', 'krakenUSD', 'itbitUSD', 'bitbayPLN', 'btcoidIDR', 'localbtcRUB', 'localbtcGBP', 'btcdeEUR', 'coinfloorGBP', 'localbtcUSD']

In [30]:
# get_data.get('data/datas.csv', period='6-hour', market='bitstampUSD')

### Load Data

In [31]:
df = pd.read_csv('data/datas.csv', sep=',')

In [32]:
df.shape

(8457, 8)

### Preprocessing

Drop rows with read problems (0.00, 2.7e+308) values

In [33]:
df = utils.dropna(df)

In [34]:
df.shape

(8061, 8)

### Transformation

Create column target with class [UP, KEEP, DOWN]

In [35]:
df['Target'] = 0 # 'KEEP'
df.loc[df.Open + (df.Open * settings.PERCENT_UP) < df.Close, 'Target'] = 1 # 'UP'
df.loc[df.Open - (df.Open * settings.PERCENT_DOWN) > df.Close, 'Target'] = 2 # 'DOWN'

In [36]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(len(df[df.Target == 1]), len(df[df.Target == 2])))

Number of rows: 8061, Number of columns: 9
Number of UP rows: 181, Number of DOWN rows: 152


Create columns from Timestamp to Date, Year, Month and Day.

### Feature Engineering

In [37]:
df['Date'] = df['Timestamp'].apply(utils.timestamptodate)
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Week'] = df['Date'].dt.weekofyear
df['Weekday'] = df['Date'].dt.weekday

# extra dates
df["yearmonth"] = df["Date"].dt.year*100 + df["Date"].dt.month
df["yearweek"] = df["Date"].dt.year*100 + df["Date"].dt.weekofyear
df["yearweekday"] = df["Date"].dt.year*10 + df["Date"].dt.weekday

In [38]:
df['High-low'] = df['High'] - df['Low']
df['High-low_mean'] = (df['High'] - df['Low']) / 2.0
df['Close-open'] = df['Close'] - df['Open']
df['Close-open_mean'] = (df['Close'] - df['Open']) / 2.0

In [39]:
# rollings: https://github.com/pandas-dev/pandas/blob/master/pandas/stats/moments.py
df['Rolling_mean_3'] = df.set_index('Date')['Close'].rolling(window=3).mean().values
df['Rolling_std_3'] = df.set_index('Date')['Close'].rolling(window=3).std().values
df['Rolling_cov_3'] = df.set_index('Date')['Close'].rolling(window=3).cov().values

df['Rolling_mean_3'] = df['Rolling_mean_3'].shift(-1)
df['Rolling_std_3'] = df['Rolling_std_3'].shift(-1)
df['Rolling_cov_3'] = df['Rolling_cov_3'].shift(-1)

df['Rolling_mean_5'] = df.set_index('Date')['Close'].rolling(window=5).mean().values
df['Rolling_std_5'] = df.set_index('Date')['Close'].rolling(window=5).std().values
df['Rolling_cov_5'] = df.set_index('Date')['Close'].rolling(window=5).cov().values

df['Rolling_mean_5'] = df['Rolling_mean_5'].shift(-1)
df['Rolling_std_5'] = df['Rolling_std_5'].shift(-1)
df['Rolling_cov_5'] = df['Rolling_cov_5'].shift(-1)

df['Rolling_mean_10'] = df.set_index('Date')['Close'].rolling(window=10).mean().values
df['Rolling_std_10'] = df.set_index('Date')['Close'].rolling(window=10).std().values
df['Rolling_cov_10'] = df.set_index('Date')['Close'].rolling(window=10).cov().values

df['Rolling_mean_10'] = df['Rolling_mean_10'].shift(-1)
df['Rolling_std_10'] = df['Rolling_std_10'].shift(-1)
df['Rolling_cov_10'] = df['Rolling_cov_10'].shift(-1)

In [40]:
# daily return

df['Daily_return'] = (df.Close/df.Open) - 1
df['Daily_return_100'] = ((df.Close/df.Open) - 1) * 100

In [41]:
# cumulative return

df['Cumulative_return'] = (df.Close/df['Close'][0]) - 1
df['Cumulative_return_100'] = (df.Close/df['Close'][0]) - 1

# cumulative return week, month, year...

In [42]:
# technical analysis (price and volume)

# momentum
df['Momentum_3'] = 
df['Momentum_7'] = 
df['Momentum_10'] = 

# simple moving average

# bollinger bands



In [43]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 8061, Number of columns: 35


Transformation previous values from Open, High, Low, Close, Volume and Weighted columns.

In [44]:
# create PREV_DAYS * len(cols) new columns
cols = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price', 'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean', 'Daily_return', 'Daily_return_100', 'Cumulative_return', 'Cumulative_return_100']
for col in cols:
    for idx in range(settings.PREV_DAYS):
        prev = idx + 1
        df[col+'-'+str(prev)] = df[col].shift(prev)
df = df.dropna()

In [45]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 8050, Number of columns: 185


### Split

In [46]:
train, test = utils.split_df2(df)

In [47]:
excl = ['Open', 'High', 'Low', 'Close','Volume_BTC', 'Volume_Currency', 'Weighted_Price', 'Target', 'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean', 'Daily_return', 'Daily_return_100', 'Cumulative_return', 'Cumulative_return_100', 'Date', 'Timestamp']
cols = [c for c in df.columns if c not in excl]

### RandomForest

In [99]:
# train
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = rf.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.304556441742
Accuracy: 0.891703924491
Coeficiente Kappa: 0.0802266139326
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1796
         UP       0.33      0.05      0.08       126
       DOWN       0.33      0.02      0.04        91

avg / total       0.84      0.89      0.85      2013

Confusion Matrix:

Predicted     0   1  2  __all__
Actual                         
0          1787   7  2     1796
1           118   6  2      126
2            84   5  2       91
__all__    1989  18  6     2013


Overall Statistics:

Accuracy: 0.891703924491
95% CI: (0.87730570401950381, 0.90494952037687759)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0802266139326
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         0          1           2
Population                                   2013       2013        2013
P: Condition positive                        1796     

### Bagging

In [79]:
# train
ba = BaggingClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = ba.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.378199070545
Accuracy: 0.890213611525
Coeficiente Kappa: 0.0991665417964
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1796
         UP       0.33      0.07      0.12       126
       DOWN       0.14      0.01      0.02        91

avg / total       0.83      0.89      0.85      2013

Confusion Matrix:

Predicted     0   1  2  __all__
Actual                         
0          1782  12  2     1796
1           113   9  4      126
2            84   6  1       91
__all__    1979  27  7     2013


Overall Statistics:

Accuracy: 0.890213611525
95% CI: (0.87573614547542811, 0.90354299752495082)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0991665417964
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        0           1           2
Population                                  2013        2013        2013
P: Condition positive                       1796      

### ExtraTrees

In [78]:
# train
et = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = et.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.367189983351
Accuracy: 0.889716840537
Coeficiente Kappa: 0.0467367469006
Report:              precision    recall  f1-score   support

       KEEP       0.90      1.00      0.94      1796
         UP       0.17      0.02      0.03       126
       DOWN       0.14      0.01      0.02        91

avg / total       0.82      0.89      0.84      2013

Confusion Matrix:

Predicted     0   1  2  __all__
Actual                         
0          1788   7  1     1796
1           119   2  5      126
2            87   3  1       91
__all__    1994  12  7     2013


Overall Statistics:

Accuracy: 0.889716840537
95% CI: (0.87521313541808388, 0.90307397942249457)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0467367469006
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         0           1           2
Population                                   2013        2013        2013
P: Condition positive                        1796   

### GradientBoosting

In [18]:
# train
gr = GradientBoostingClassifier(n_estimators=1000, max_depth=10, warm_start=True, learning_rate=0.08, random_state=17, verbose=0)
model1 = gr.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.827814901418
Accuracy: 0.88399189463
Coeficiente Kappa: 0.113451030999
Report:              precision    recall  f1-score   support

       KEEP       0.91      0.98      0.94      1766
         UP       0.22      0.07      0.11       123
       DOWN       0.16      0.04      0.06        85

avg / total       0.83      0.88      0.85      1974

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1733  23  10     1766
2           108   9   6      123
3            73   9   3       85
__all__    1914  41  19     1974


Overall Statistics:

Accuracy: 0.88399189463
95% CI: (0.86903834395992885, 0.89779374250552457)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.113451030999
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                  1974       1974        1974
P: Condition positive                       1766      

### AdaBoost

In [19]:
# train
ab = AdaBoostClassifier(n_estimators=500, random_state=17)
model1 = ab.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 1.09385283936
Accuracy: 0.892097264438
Coeficiente Kappa: 0.00799339388935
Report:              precision    recall  f1-score   support

       KEEP       0.90      1.00      0.94      1766
         UP       0.20      0.01      0.02       123
       DOWN       0.00      0.00      0.00        85

avg / total       0.81      0.89      0.84      1974

Confusion Matrix:

Predicted     1  2  3  __all__
Actual                        
1          1760  4  2     1766
2           121  1  1      123
3            85  0  0       85
__all__    1966  5  3     1974


Overall Statistics:

Accuracy: 0.892097264438
95% CI: (0.87757047966540269, 0.90544717661034702)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.00799339388935
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1974        1974        1974
P: Condition positive                        1766        

### VotingClassifier

In [20]:
# train
v = VotingClassifier(estimators=[
        ('rf', rf), ('ba', ba), ('et', et), ('gr', gr), ('ab', ab)
    ], voting='hard', n_jobs=-1)
model1 = v.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
# y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred)

  **self._backend_args)


Accuracy: 0.891590678825
Coeficiente Kappa: 0.0686544261601
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.95      1766
         UP       0.26      0.04      0.07       123
       DOWN       0.00      0.00      0.00        85

avg / total       0.82      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2  3  __all__
Actual                         
1          1755   9  2     1766
2           113   5  5      123
3            80   5  0       85
__all__    1948  19  7     1974


Overall Statistics:

Accuracy: 0.891590678825
95% CI: (0.87703653818171834, 0.90496952399186592)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0686544261601
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1974        1974        1974
P: Condition positive                        1766         123          85
N: Condi

In [21]:
# train
v = VotingClassifier(estimators=[
        ('rf', rf), ('gr', gr)
    ], voting='hard', n_jobs=-1)
model1 = v.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
# y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred)

Accuracy: 0.892097264438
Coeficiente Kappa: 0.0627428339601
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1766
         UP       0.29      0.04      0.07       123
       DOWN       0.00      0.00      0.00        85

avg / total       0.82      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2  3  __all__
Actual                         
1          1756   8  2     1766
2           114   5  4      123
3            81   4  0       85
__all__    1951  17  6     1974


Overall Statistics:

Accuracy: 0.892097264438
95% CI: (0.87757047966540269, 0.90544717661034702)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0627428339601
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1974        1974        1974
P: Condition positive                        1766         123          85
N: Condi

# xgboost

In [48]:
y_train = train['Target']
y_mean = np.mean(y_train)
xgb_params = {
    'n_trees': 800,
    'eta': 0.0045,
    'max_depth': 20,
    'subsample': 0.85,
    'objective': 'multi:softmax',
    'num_class' : 3,
    'eval_metric': 'mlogloss', # 'merror', # 'rmse',
    'base_score': 0,
    'silent': 1
}

dtrain = xgb.DMatrix(train[cols], y_train)
dtest = xgb.DMatrix(test[cols])

cv_result = xgb.cv(xgb_params, dtrain)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=5000,
                   early_stopping_rounds=50,
                   verbose_eval=50,
                   show_stdv=False
                  )
num_boost_rounds = len(cv_result)

#num_boost_rounds = 756

print(num_boost_rounds)

# train
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

y_pred = model.predict(dtest)
y_true = test['Target']

# utils.metrics(y_true, y_pred)
utils.metrics2(y_true, y_pred)

[0]	train-mlogloss:1.0923	test-mlogloss:1.09255
[50]	train-mlogloss:0.831088	test-mlogloss:0.843383
[100]	train-mlogloss:0.645702	test-mlogloss:0.668392
[150]	train-mlogloss:0.508775	test-mlogloss:0.54093
[200]	train-mlogloss:0.405045	test-mlogloss:0.445697
[250]	train-mlogloss:0.324965	test-mlogloss:0.373407
[300]	train-mlogloss:0.262355	test-mlogloss:0.317884
[350]	train-mlogloss:0.212897	test-mlogloss:0.274943
[400]	train-mlogloss:0.173563	test-mlogloss:0.241544
[450]	train-mlogloss:0.142023	test-mlogloss:0.215451
[500]	train-mlogloss:0.116754	test-mlogloss:0.195002
[550]	train-mlogloss:0.0964013	test-mlogloss:0.179107
[600]	train-mlogloss:0.0799527	test-mlogloss:0.166819
[650]	train-mlogloss:0.066649	test-mlogloss:0.157379
[700]	train-mlogloss:0.0558903	test-mlogloss:0.150134
[750]	train-mlogloss:0.0471397	test-mlogloss:0.14468
[800]	train-mlogloss:0.040001	test-mlogloss:0.140676
[850]	train-mlogloss:0.0341773	test-mlogloss:0.13777
[900]	train-mlogloss:0.0294187	test-mlogloss:0.135

In [49]:
import operator
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))
importance

[('Rolling_cov_3', 1),
 ('Close-3', 27),
 ('Close-9', 35),
 ('Weighted_Price-5', 40),
 ('Close-10', 53),
 ('Close-5', 55),
 ('Close-4', 63),
 ('Weighted_Price-8', 63),
 ('Close-6', 64),
 ('Close-2', 73),
 ('Weighted_Price-9', 79),
 ('Close-8', 83),
 ('Weighted_Price-3', 84),
 ('Weighted_Price-10', 99),
 ('Weighted_Price-6', 108),
 ('Weighted_Price-4', 109),
 ('Low-5', 163),
 ('Close-7', 173),
 ('Weighted_Price-7', 185),
 ('Low-9', 191),
 ('Low-10', 210),
 ('Low-7', 217),
 ('Low-8', 226),
 ('Weighted_Price-2', 237),
 ('High-9', 241),
 ('High-6', 261),
 ('Year', 261),
 ('High-8', 289),
 ('Weighted_Price-1', 299),
 ('Open-4', 323),
 ('High-2', 356),
 ('Low-6', 359),
 ('High-3', 372),
 ('Open-5', 385),
 ('High-7', 388),
 ('High-10', 407),
 ('Open-10', 418),
 ('High-4', 421),
 ('Open-7', 444),
 ('Low-3', 477),
 ('Open-2', 486),
 ('Open-8', 506),
 ('Open-9', 525),
 ('High-5', 531),
 ('Open-3', 539),
 ('High-1', 569),
 ('Open-6', 585),
 ('Rolling_mean_10', 586),
 ('Low-2', 604),
 ('Close-1', 

In [50]:
importance = model.get_score(fmap='', importance_type='gain')
importance = sorted(importance.items(), key=operator.itemgetter(1))
importance

[('Rolling_cov_3', 0.283062),
 ('Volume_BTC-6', 0.54913567398136),
 ('Volume_BTC-5', 0.549397437512129),
 ('Close-open-5', 0.5742087381478608),
 ('Weekday', 0.5937165063791596),
 ('Close-2', 0.5976722636986302),
 ('Day', 0.5977523443722788),
 ('Volume_Currency-4', 0.5981558209299579),
 ('Close-open-10', 0.6214691654187184),
 ('Close-6', 0.622954784375),
 ('Volume_BTC-7', 0.6241601319735293),
 ('Close-open-1', 0.625957520254697),
 ('Volume_BTC-2', 0.6273574386504271),
 ('Open-10', 0.6275084497272725),
 ('Volume_BTC-4', 0.647593499280287),
 ('High-low-3', 0.6517931748815158),
 ('Close-open-2', 0.6524081680194008),
 ('Volume_BTC-9', 0.6536004495116268),
 ('Close-open-7', 0.6586853951428567),
 ('Close-open-3', 0.6611472577375713),
 ('Week', 0.6617051344381956),
 ('Volume_Currency-7', 0.6731674536986693),
 ('Volume_Currency-6', 0.6735103593543655),
 ('yearweekday', 0.677639406182325),
 ('Daily_return-8', 0.6777655941476387),
 ('Volume_BTC-3', 0.6792541153437185),
 ('Daily_return-5', 0.68004

In [75]:
from sklearn.linear_model import *

In [76]:
# train
rf = LogisticRegression(max_iter=2000)
model1 = rf.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred)

Accuracy: 0.893691008445
Coeficiente Kappa: 0.0877920383872
Report:              precision    recall  f1-score   support

       KEEP       0.90      1.00      0.95      1796
         UP       0.45      0.04      0.07       126
       DOWN       0.40      0.04      0.08        91

avg / total       0.85      0.89      0.85      2013

Confusion Matrix:

Predicted     0   1   2  __all__
Actual                          
0          1790   3   3     1796
1           118   5   3      126
2            84   3   4       91
__all__    1992  11  10     2013


Overall Statistics:

Accuracy: 0.893691008445
95% CI: (0.87939970151524827, 0.90682362399512173)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0877920383872
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         0           1           2
Population                                   2013        2013        2013
P: Condition positive                        1796         126          91
N: