In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb

import settings
import utils
import get_data

### Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['coincheckJPY', 'krakenEUR', 'bitstampUSD', 'okcoinCNY', 'btcnCNY', 'krakenUSD', 'itbitUSD', 'bitbayPLN', 'btcoidIDR', 'localbtcRUB', 'localbtcGBP', 'btcdeEUR', 'coinfloorGBP', 'localbtcUSD']

In [2]:
# Fetch 6-hour bitstampUSD candles through the project's get_data helper.
# 'data/datas.csv' is presumably the output path -- the "Load Data" cell reads that file.
get_data.get('data/datas.csv', period='6-hour', market='bitstampUSD')

Loading.....
366 of 2110 days loaded...
732 of 2110 days loaded...
1098 of 2110 days loaded...
1464 of 2110 days loaded...
1830 of 2110 days loaded...
2196 of 2110 days loaded...
Last Timestamp: 2017-06-22 14:00:00


### Load Data

In [2]:
# Load the CSV into the working DataFrame used by every later cell.
df = pd.read_csv('data/datas.csv', sep=',')

In [3]:
# (rows, columns) of the frame as loaded from disk.
df.shape

(8442, 8)

### Preprocessing

Drop rows that were read incorrectly (values of 0.00 or 2.7e+308).

In [4]:
# Project helper; presumably removes the rows with the bad readings described
# in the markdown above -- confirm against utils.dropna.
df = utils.dropna(df)

In [5]:
# (rows, columns) after utils.dropna, for comparison with the shape shown before it.
df.shape

(8046, 8)

### Transformation

Create a `Target` column with the classes KEEP, UP and DOWN (encoded as 0, 1 and 2).

In [6]:
# Label every row: 0 = KEEP (default), 1 = UP, 2 = DOWN.
# settings.PERCENT_UP / PERCENT_DOWN scale the opening price to give the
# levels the close has to cross.
up_threshold = df['Open'] + df['Open'] * settings.PERCENT_UP
down_threshold = df['Open'] - df['Open'] * settings.PERCENT_DOWN

df['Target'] = 0  # 'KEEP'
df.loc[df['Close'] > up_threshold, 'Target'] = 1  # 'UP'
# Assigned last, so DOWN wins if a row ever satisfied both conditions.
df.loc[df['Close'] < down_threshold, 'Target'] = 2  # 'DOWN'

In [7]:
# Report the frame size and how many rows fall into the UP / DOWN classes.
n_rows, n_cols = df.shape
n_up = (df['Target'] == 1).sum()
n_down = (df['Target'] == 2).sum()
print('Number of rows: {}, Number of columns: {}'.format(n_rows, n_cols))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(n_up, n_down))

Number of rows: 8046, Number of columns: 9
Number of UP rows: 451, Number of DOWN rows: 351


Derive Date, Year, Month, Day, Week and Weekday columns from Timestamp.

### Feature Engineering

In [8]:
# Convert the raw Timestamp to a datetime column, then derive calendar features.
df['Date'] = pd.to_datetime(df['Timestamp'].apply(utils.timestamptodate))
date_parts = df['Date'].dt

# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# Use the ISO calendar week where available (cast back to a plain int64 so the
# column keeps the dtype weekofyear produced) and fall back on older pandas.
if hasattr(date_parts, 'isocalendar'):
    week_of_year = date_parts.isocalendar().week.astype('int64')
else:
    week_of_year = date_parts.weekofyear

df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Day'] = date_parts.day
df['Week'] = week_of_year
df['Weekday'] = date_parts.weekday

# extra dates: year combined with month / week / weekday into one integer key
df["yearmonth"] = date_parts.year*100 + date_parts.month
df["yearweek"] = date_parts.year*100 + week_of_year
df["yearweekday"] = date_parts.year*10 + date_parts.weekday

In [9]:
# Per-row spreads: High minus Low and Close minus Open, plus each one halved.
high_low_spread = df['High'] - df['Low']
close_open_spread = df['Close'] - df['Open']

df['High-low'] = high_low_spread
df['High-low_mean'] = high_low_spread / 2.0
df['Close-open'] = close_open_spread
df['Close-open_mean'] = close_open_spread / 2.0

In [10]:
# Frame size after the date and spread features were added.
n_rows, n_cols = df.shape
print('Number of rows: {}, Number of columns: {}'.format(n_rows, n_cols))

Number of rows: 8046, Number of columns: 22


Add the previous values of the Open, High, Low, Close, Volume and Weighted_Price columns (and of the derived spread columns) as new lagged features.

In [11]:
# Add settings.PREV_DAYS lagged copies of each listed column, named '<col>-<lag>'
# (PREV_DAYS * len(cols) new columns in total).
cols = [
    'Open', 'High', 'Low', 'Close',
    'Volume_BTC', 'Volume_Currency', 'Weighted_Price',
    'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean'
]
for col in cols:
    for prev in range(1, settings.PREV_DAYS + 1):
        df['{}-{}'.format(col, prev)] = df[col].shift(prev)

# shift() leaves NaN in the leading rows that have no full history; dropna()
# removes those (and any other row containing a NaN).
df = df.dropna()

In [12]:
# Frame size after the lag columns were added and incomplete rows dropped.
n_rows, n_cols = df.shape
print('Number of rows: {}, Number of columns: {}'.format(n_rows, n_cols))

Number of rows: 8036, Number of columns: 132


### Split

In [13]:
# Project helper returning the (train, test) pair used by every model below.
# NOTE(review): split strategy (chronological vs. random, ratio) is not visible here -- confirm in utils.split_df2.
train, test = utils.split_df2(df)

In [14]:
# Columns withheld from the models: the current row's raw prices/volumes and
# spreads, the Target label itself, and the raw Date/Timestamp columns.
excl = [
    'Open', 'High', 'Low', 'Close',
    'Volume_BTC', 'Volume_Currency', 'Weighted_Price',
    'Target',
    'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean',
    'Date', 'Timestamp'
]
# Everything else is a feature; the frame's column order is preserved.
cols = df.columns[~df.columns.isin(excl)].tolist()

### RandomForest

In [45]:
# Random forest: fit on the training split, score on the test split.
features_train = train[cols]
features_test = test[cols]
y_true = test['Target']

rf = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
    random_state=17,
    verbose=0
)
model1 = rf.fit(features_train, train['Target'])

# Hard labels plus per-class probabilities (the latter feed the cross-entropy metric).
y_pred = model1.predict(features_test)
y_pred_proba = model1.predict_proba(features_test)

utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.356701730044
Accuracy: 0.891488302638
Coeficiente Kappa: 0.0597577914843
Report:              precision    recall  f1-score   support

       KEEP       0.90      1.00      0.94      1793
         UP       0.29      0.03      0.06       126
       DOWN       0.40      0.02      0.04        90

avg / total       0.84      0.89      0.85      2009

Confusion Matrix:

Predicted     0   1  2  __all__
Actual                         
0          1785   7  1     1793
1           120   4  2      126
2            85   3  2       90
__all__    1990  14  5     2009


Overall Statistics:

Accuracy: 0.891488302638
95% CI: (0.87706339928689947, 0.90475892063868524)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0597577914843
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        0           1           2
Population                                  2009        2009        2009
P: Condition positive                       1793      

### Bagging

In [16]:
# Bagging ensemble: fit on the training split, score on the test split.
features_train = train[cols]
features_test = test[cols]
y_true = test['Target']

ba = BaggingClassifier(
    n_estimators=1000,
    n_jobs=-1,
    random_state=17,
    verbose=0
)
model1 = ba.fit(features_train, train['Target'])

# Hard labels plus per-class probabilities for the metrics helper.
y_pred = model1.predict(features_test)
y_pred_proba = model1.predict_proba(features_test)

utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.745588275538
Accuracy: 0.887537993921
Coeficiente Kappa: 0.0712340438158
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1766
         UP       0.20      0.04      0.07       123
       DOWN       0.08      0.01      0.02        85

avg / total       0.82      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1746  15   5     1766
2           112   5   6      123
3            79   5   1       85
__all__    1937  25  12     1974


Overall Statistics:

Accuracy: 0.887537993921
95% CI: (0.87276833005262688, 0.90114495955282337)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0712340438158
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                  1974       1974        1974
P: Condition positive                       1766  

### ExtraTrees

In [17]:
# Extra-trees ensemble: fit on the training split, score on the test split.
features_train = train[cols]
features_test = test[cols]
y_true = test['Target']

et = ExtraTreesClassifier(
    n_estimators=1000,
    n_jobs=-1,
    random_state=17,
    verbose=0
)
model1 = et.fit(features_train, train['Target'])

# Hard labels plus per-class probabilities for the metrics helper.
y_pred = model1.predict(features_test)
y_pred_proba = model1.predict_proba(features_test)

utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.53233703279
Accuracy: 0.88905775076
Coeficiente Kappa: 0.11489063738
Report:              precision    recall  f1-score   support

       KEEP       0.91      0.99      0.94      1766
         UP       0.26      0.07      0.10       123
       DOWN       0.19      0.04      0.06        85

avg / total       0.83      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1744  14   8     1766
2           110   8   5      123
3            73   9   3       85
__all__    1927  31  16     1974


Overall Statistics:

Accuracy: 0.88905775076
95% CI: (0.87436822397740932, 0.90257985936339491)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.11489063738
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                  1974       1974        1974
P: Condition positive                       1766        1

### GradientBoosting

In [18]:
# Gradient boosting: fit on the training split, score on the test split.
features_train = train[cols]
features_test = test[cols]
y_true = test['Target']

# NOTE(review): warm_start=True looks redundant here because the estimator is
# created fresh and fitted once -- confirm before removing.
gr = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=10,
    warm_start=True,
    learning_rate=0.08,
    random_state=17,
    verbose=0
)
model1 = gr.fit(features_train, train['Target'])

# Hard labels plus per-class probabilities for the metrics helper.
y_pred = model1.predict(features_test)
y_pred_proba = model1.predict_proba(features_test)

utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.827814901418
Accuracy: 0.88399189463
Coeficiente Kappa: 0.113451030999
Report:              precision    recall  f1-score   support

       KEEP       0.91      0.98      0.94      1766
         UP       0.22      0.07      0.11       123
       DOWN       0.16      0.04      0.06        85

avg / total       0.83      0.88      0.85      1974

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1733  23  10     1766
2           108   9   6      123
3            73   9   3       85
__all__    1914  41  19     1974


Overall Statistics:

Accuracy: 0.88399189463
95% CI: (0.86903834395992885, 0.89779374250552457)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.113451030999
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                  1974       1974        1974
P: Condition positive                       1766      

### AdaBoost

In [19]:
# AdaBoost: fit on the training split, score on the test split.
features_train = train[cols]
features_test = test[cols]
y_true = test['Target']

ab = AdaBoostClassifier(n_estimators=500, random_state=17)
model1 = ab.fit(features_train, train['Target'])

# Hard labels plus per-class probabilities for the metrics helper.
y_pred = model1.predict(features_test)
y_pred_proba = model1.predict_proba(features_test)

utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 1.09385283936
Accuracy: 0.892097264438
Coeficiente Kappa: 0.00799339388935
Report:              precision    recall  f1-score   support

       KEEP       0.90      1.00      0.94      1766
         UP       0.20      0.01      0.02       123
       DOWN       0.00      0.00      0.00        85

avg / total       0.81      0.89      0.84      1974

Confusion Matrix:

Predicted     1  2  3  __all__
Actual                        
1          1760  4  2     1766
2           121  1  1      123
3            85  0  0       85
__all__    1966  5  3     1974


Overall Statistics:

Accuracy: 0.892097264438
95% CI: (0.87757047966540269, 0.90544717661034702)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.00799339388935
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1974        1974        1974
P: Condition positive                        1766        

### VotingClassifier

In [20]:
# Hard-voting ensemble over all five classifiers configured above.
voting_members = [('rf', rf), ('ba', ba), ('et', et), ('gr', gr), ('ab', ab)]
v = VotingClassifier(estimators=voting_members, voting='hard', n_jobs=-1)

y_true = test['Target']
model1 = v.fit(train[cols], train['Target'])

# Only hard labels are requested from this ensemble; the probability call is left disabled.
y_pred = model1.predict(test[cols])
# y_pred_proba = model1.predict_proba(test[cols])

utils.metrics(y_true, y_pred)

  **self._backend_args)


Accuracy: 0.891590678825
Coeficiente Kappa: 0.0686544261601
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.95      1766
         UP       0.26      0.04      0.07       123
       DOWN       0.00      0.00      0.00        85

avg / total       0.82      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2  3  __all__
Actual                         
1          1755   9  2     1766
2           113   5  5      123
3            80   5  0       85
__all__    1948  19  7     1974


Overall Statistics:

Accuracy: 0.891590678825
95% CI: (0.87703653818171834, 0.90496952399186592)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0686544261601
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1974        1974        1974
P: Condition positive                        1766         123          85
N: Condi

In [21]:
# Hard-voting ensemble restricted to the random forest and gradient boosting models.
voting_members = [('rf', rf), ('gr', gr)]
v = VotingClassifier(estimators=voting_members, voting='hard', n_jobs=-1)

y_true = test['Target']
model1 = v.fit(train[cols], train['Target'])

# Only hard labels are requested from this ensemble; the probability call is left disabled.
y_pred = model1.predict(test[cols])
# y_pred_proba = model1.predict_proba(test[cols])

utils.metrics(y_true, y_pred)

Accuracy: 0.892097264438
Coeficiente Kappa: 0.0627428339601
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1766
         UP       0.29      0.04      0.07       123
       DOWN       0.00      0.00      0.00        85

avg / total       0.82      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2  3  __all__
Actual                         
1          1756   8  2     1766
2           114   5  4      123
3            81   4  0       85
__all__    1951  17  6     1974


Overall Statistics:

Accuracy: 0.892097264438
95% CI: (0.87757047966540269, 0.90544717661034702)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0627428339601
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1974        1974        1974
P: Condition positive                        1766         123          85
N: Condi

In [16]:
# Multi-class XGBoost model trained with the native API on the same split.
y_train = train['Target']

# The former 'n_trees' key was removed: it is not an XGBoost parameter, and the
# number of boosting rounds is controlled by num_boost_round below.
xgb_params = {
    'eta': 0.0045,
    'max_depth': 20,
    'subsample': 0.85,
    'objective': 'multi:softmax',
    'num_class' : 3,
    'eval_metric': 'mlogloss', # 'merror', # 'rmse',
    'base_score': 0,
    'silent': 1
}

dtrain = xgb.DMatrix(train[cols], y_train)
dtest = xgb.DMatrix(test[cols])

# Cross-validation with xgb.cv's default settings.
# NOTE(review): cv_result is not read by any cell visible here -- confirm it is
# still needed before keeping this extra training pass.
cv_result = xgb.cv(xgb_params, dtrain)

# Disabled full search for the round count (presumably the source of 756 below):
#   cv_result = xgb.cv(xgb_params,
#                      dtrain,
#                      num_boost_round=5000,
#                      early_stopping_rounds=50,
#                      verbose_eval=50,
#                      show_stdv=False
#                     )
#   num_boost_rounds = len(cv_result)
num_boost_rounds = 756
print(num_boost_rounds)

# train
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

# 'multi:softmax' returns the predicted class ids as floats (0.0, 1.0, 2.0);
# cast to int so they have the same type as the integer labels in y_true.
y_pred = model.predict(dtest).astype(int)
y_true = test['Target']

utils.metrics(y_true, y_pred)

756
Accuracy: 0.892483822797
Coeficiente Kappa: 0.0648927504752
Report:              precision    recall  f1-score   support

       KEEP       0.90      1.00      0.94      1793
         UP       0.40      0.05      0.09       126
       DOWN       0.33      0.01      0.02        90

avg / total       0.84      0.89      0.85      2009

Confusion Matrix:

Predicted   0.0  1.0  2.0  __all__
Actual                            
0.0        1786    7    0     1793
1.0         118    6    2      126
2.0          87    2    1       90
__all__    1991   15    3     2009


Overall Statistics:

Accuracy: 0.892483822797
95% CI: (0.87811227508338807, 0.9056980410486779)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0648927504752
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                       0.0         1.0         2.0
Population                                   2009        2009        2009
P: Condition positive                        1793         126

In [18]:
# Cross-check the XGBoost predictions with scikit-learn's own metrics.
# The wildcard import is kept as-is because later cells may rely on names it provides.
from sklearn.metrics import *

# Parenthesised single-argument print works as both the Python 2 statement and
# the Python 3 function, matching the print(...) style used in the other cells.
print('Accuracy: {}'.format(accuracy_score(y_true, y_pred)))
# print('Coeficiente Kappa: {}'.format(cohen_kappa_score(y_true, y_pred)))
print("Confussion Matrix:")
print(confusion_matrix(y_true, y_pred))

Accuracy: 0.892483822797
Confussion Matrix:
[[1786    7    0]
 [ 118    6    2]
 [  87    2    1]]


In [19]:
import operator

# Per-feature scores from Booster.get_fscore(), listed from lowest to highest.
importance = list(model.get_fscore().items())
importance.sort(key=operator.itemgetter(1))
importance

[('Close-7', 124),
 ('Close-9', 129),
 ('Weighted_Price-8', 132),
 ('Weighted_Price-5', 134),
 ('Close-5', 170),
 ('Weighted_Price-9', 179),
 ('Close-6', 180),
 ('Close-3', 182),
 ('Close-8', 189),
 ('Close-10', 195),
 ('Close-4', 200),
 ('Weighted_Price-3', 219),
 ('Close-2', 224),
 ('Weighted_Price-10', 246),
 ('Weighted_Price-7', 286),
 ('Weighted_Price-2', 295),
 ('Weighted_Price-6', 295),
 ('High-9', 331),
 ('Weighted_Price-4', 396),
 ('Low-7', 424),
 ('High-6', 441),
 ('High-7', 463),
 ('Low-6', 488),
 ('High-5', 501),
 ('High-4', 539),
 ('Low-9', 548),
 ('High-8', 554),
 ('High-3', 563),
 ('Low-8', 572),
 ('High-2', 575),
 ('Low-4', 580),
 ('Low-10', 596),
 ('Year', 621),
 ('High-10', 632),
 ('Weighted_Price-1', 664),
 ('Low-5', 794),
 ('Low-2', 891),
 ('Open-8', 981),
 ('Open-7', 993),
 ('Open-6', 1008),
 ('Open-5', 1046),
 ('High-1', 1121),
 ('Low-3', 1194),
 ('Open-9', 1227),
 ('Open-4', 1232),
 ('Open-3', 1254),
 ('Close-1', 1270),
 ('Low-1', 1342),
 ('Open-2', 1493),
 ('Ope

In [20]:
# Per-feature 'gain' importance, listed from lowest to highest.
gain_by_feature = model.get_score(fmap='', importance_type='gain')
importance = sorted(gain_by_feature.items(), key=lambda item: item[1])
importance

[('Weighted_Price-6', 0.5957750148074574),
 ('Year', 0.6197839839001609),
 ('Week', 0.7020748236126422),
 ('Volume_BTC-7', 0.7305912394242583),
 ('Day', 0.7327996313422134),
 ('Close-open-5', 0.7373282468116058),
 ('Month', 0.7386916602798842),
 ('Close-open-10', 0.7428657889155549),
 ('Volume_BTC-6', 0.7478980770011783),
 ('Volume_BTC-5', 0.7498979976230021),
 ('Close-open-8', 0.7912348557582992),
 ('Volume_BTC-3', 0.7998627868920614),
 ('Close-open-2', 0.8020697324867831),
 ('Weekday', 0.8104826137797139),
 ('Volume_BTC-4', 0.820690728570925),
 ('Close-open-9', 0.832916263688126),
 ('Volume_BTC-8', 0.8341002583573637),
 ('Volume_Currency-6', 0.8363203694976166),
 ('Close-open-7', 0.8389662971651104),
 ('Close-open-4', 0.8404302557226205),
 ('Open-5', 0.8424068035086046),
 ('Volume_BTC-10', 0.8502030040497354),
 ('Volume_Currency-7', 0.8550742900887157),
 ('High-low-10', 0.8627968021309639),
 ('Close-open-6', 0.8756298357601816),
 ('High-low-5', 0.8758025429768983),
 ('Close-8', 0.877