In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import *

import settings
import utils
import get_data

### Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['coincheckJPY', 'krakenEUR', 'bitstampUSD', 'okcoinCNY', 'btcnCNY', 'krakenUSD', 'itbitUSD', 'bitbayPLN', 'btcoidIDR', 'localbtcRUB', 'localbtcGBP', 'btcdeEUR', 'coinfloorGBP', 'localbtcUSD']

In [2]:
get_data.get('data/datas.csv', period='6-hour', market='bitstampUSD')

Loading.....
366 of 2075 days loaded...
732 of 2075 days loaded...
1098 of 2075 days loaded...
1464 of 2075 days loaded...
1830 of 2075 days loaded...
2196 of 2075 days loaded...
Last Timestamp: 2017-05-18 14:00:00


### Load Data

In [3]:
df = pd.read_csv('data/datas.csv', sep=',')

In [4]:
df.shape

(8302, 8)

### Preprocessing

Drop rows with read problems (0.00, 2.7e+308) values

In [5]:
df = utils.dropna(df)

In [6]:
df.shape

(7906, 8)

### Transformation

Create column target with class [UP, KEEP, DOWN]

In [7]:
df['Target'] = 1 # 'KEEP'
df.loc[df.Open + (df.Open * settings.PERCENT_UP) < df.Close, 'Target'] = 2 # 'UP'
df.loc[df.Open - (df.Open * settings.PERCENT_DOWN) > df.Close, 'Target'] = 3 # 'DOWN'

In [8]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(len(df[df.Target == 2]), len(df[df.Target == 3])))

Number of rows: 7906, Number of columns: 9
Number of UP rows: 431, Number of DOWN rows: 336


Create columns from Timestamp to Date, Year, Month and Day.

In [9]:
df['Date'] = df['Timestamp'].apply(utils.timestamptodate)
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

In [10]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 7906, Number of columns: 13


Transformation previous values from Open, High, Low, Close, Volume and Weighted columns.

In [11]:
# create PREV_DAYS * len(cols) new columns
cols = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price']
for i in cols:
    col = i
    for idx in range(settings.PREV_DAYS):
        prev = idx + 1
        df[col+'-'+str(prev)] = df.Close.shift(prev)
df = df.dropna()

In [12]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 7896, Number of columns: 83


### Split

In [13]:
train, test = utils.split_df2(df)

In [14]:
excl = ['Open', 'High', 'Low', 'Close','Volume_BTC', 'Volume_Currency', 'Weighted_Price', 'Target', 'Date', 'Year', 'Day', 'Timestamp']
cols = [c for c in df.columns if c not in excl]

### RandomForest

In [15]:
# train
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = rf.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.612690026124
Accuracy: 0.892603850051
Coeficiente Kappa: 0.080918800087
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.95      1766
         UP       0.29      0.04      0.07       123
       DOWN       0.20      0.02      0.04        85

avg / total       0.83      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1755   8   3     1766
2           113   5   5      123
3            79   4   2       85
__all__    1947  17  10     1974


Overall Statistics:

Accuracy: 0.892603850051
95% CI: (0.87810451536194267, 0.90592473444619248)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.080918800087
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1974        1974        1974
P: Condition positive                        176

### Bagging

In [16]:
# train
ba = BaggingClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = ba.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.745588275538
Accuracy: 0.887537993921
Coeficiente Kappa: 0.0712340438158
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1766
         UP       0.20      0.04      0.07       123
       DOWN       0.08      0.01      0.02        85

avg / total       0.82      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1746  15   5     1766
2           112   5   6      123
3            79   5   1       85
__all__    1937  25  12     1974


Overall Statistics:

Accuracy: 0.887537993921
95% CI: (0.87276833005262688, 0.90114495955282337)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0712340438158
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                  1974       1974        1974
P: Condition positive                       1766  

### ExtraTrees

In [17]:
# train
et = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = et.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.53233703279
Accuracy: 0.88905775076
Coeficiente Kappa: 0.11489063738
Report:              precision    recall  f1-score   support

       KEEP       0.91      0.99      0.94      1766
         UP       0.26      0.07      0.10       123
       DOWN       0.19      0.04      0.06        85

avg / total       0.83      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1744  14   8     1766
2           110   8   5      123
3            73   9   3       85
__all__    1927  31  16     1974


Overall Statistics:

Accuracy: 0.88905775076
95% CI: (0.87436822397740932, 0.90257985936339491)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.11489063738
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                  1974       1974        1974
P: Condition positive                       1766        1

### GradientBoosting

In [18]:
# train
gr = GradientBoostingClassifier(n_estimators=1000, max_depth=10, warm_start=True, learning_rate=0.08, random_state=17, verbose=0)
model1 = gr.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.827814901418
Accuracy: 0.88399189463
Coeficiente Kappa: 0.113451030999
Report:              precision    recall  f1-score   support

       KEEP       0.91      0.98      0.94      1766
         UP       0.22      0.07      0.11       123
       DOWN       0.16      0.04      0.06        85

avg / total       0.83      0.88      0.85      1974

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1733  23  10     1766
2           108   9   6      123
3            73   9   3       85
__all__    1914  41  19     1974


Overall Statistics:

Accuracy: 0.88399189463
95% CI: (0.86903834395992885, 0.89779374250552457)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.113451030999
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                  1974       1974        1974
P: Condition positive                       1766      

### AdaBoost

In [19]:
# train
ab = AdaBoostClassifier(n_estimators=500, random_state=17)
model1 = ab.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 1.09385283936
Accuracy: 0.892097264438
Coeficiente Kappa: 0.00799339388935
Report:              precision    recall  f1-score   support

       KEEP       0.90      1.00      0.94      1766
         UP       0.20      0.01      0.02       123
       DOWN       0.00      0.00      0.00        85

avg / total       0.81      0.89      0.84      1974

Confusion Matrix:

Predicted     1  2  3  __all__
Actual                        
1          1760  4  2     1766
2           121  1  1      123
3            85  0  0       85
__all__    1966  5  3     1974


Overall Statistics:

Accuracy: 0.892097264438
95% CI: (0.87757047966540269, 0.90544717661034702)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.00799339388935
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1974        1974        1974
P: Condition positive                        1766        

### VotingClassifier

In [20]:
# train
v = VotingClassifier(estimators=[
        ('rf', rf), ('ba', ba), ('et', et), ('gr', gr), ('ab', ab)
    ], voting='hard', n_jobs=-1)
model1 = v.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
# y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred)

  **self._backend_args)


Accuracy: 0.891590678825
Coeficiente Kappa: 0.0686544261601
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.95      1766
         UP       0.26      0.04      0.07       123
       DOWN       0.00      0.00      0.00        85

avg / total       0.82      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2  3  __all__
Actual                         
1          1755   9  2     1766
2           113   5  5      123
3            80   5  0       85
__all__    1948  19  7     1974


Overall Statistics:

Accuracy: 0.891590678825
95% CI: (0.87703653818171834, 0.90496952399186592)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0686544261601
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1974        1974        1974
P: Condition positive                        1766         123          85
N: Condi

In [21]:
# train
v = VotingClassifier(estimators=[
        ('rf', rf), ('gr', gr)
    ], voting='hard', n_jobs=-1)
model1 = v.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
# y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred)

Accuracy: 0.892097264438
Coeficiente Kappa: 0.0627428339601
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1766
         UP       0.29      0.04      0.07       123
       DOWN       0.00      0.00      0.00        85

avg / total       0.82      0.89      0.85      1974

Confusion Matrix:

Predicted     1   2  3  __all__
Actual                         
1          1756   8  2     1766
2           114   5  4      123
3            81   4  0       85
__all__    1951  17  6     1974


Overall Statistics:

Accuracy: 0.892097264438
95% CI: (0.87757047966540269, 0.90544717661034702)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0627428339601
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1974        1974        1974
P: Condition positive                        1766         123          85
N: Condi

### Conclusions

In progress...