In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import *

import settings
import utils
import get_data

### Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['coincheckJPY', 'krakenEUR', 'bitstampUSD', 'okcoinCNY', 'btcnCNY', 'krakenUSD', 'itbitUSD', 'bitbayPLN', 'btcoidIDR', 'localbtcRUB', 'localbtcGBP', 'btcdeEUR', 'coinfloorGBP', 'localbtcUSD']

In [25]:
get_data.get('data/datas.csv', period='6-hour', market='bitstampUSD')

Loading.....
366 of 2043 days loaded...
732 of 2043 days loaded...
1098 of 2043 days loaded...
1464 of 2043 days loaded...
1830 of 2043 days loaded...
2196 of 2043 days loaded...
Last Timestamp: 2017-04-16 20:00:00


### Load Data

In [26]:
df = pd.read_csv('data/datas.csv', sep=',')

In [27]:
df.shape

(8175, 8)

### Preprocessing

Drop rows with read problems (0.00, 2.7e+308) values

In [28]:
df = utils.dropna(df)

In [29]:
df.shape

(7779, 8)

### Transformation

Create column target with class [UP, KEEP, DOWN]

In [30]:
df['Target'] = 1 # 'KEEP'
df.loc[df.Open + (df.Open * settings.PERCENT_UP) < df.Close, 'Target'] = 2 # 'UP'
df.loc[df.Open - (df.Open * settings.PERCENT_DOWN) > df.Close, 'Target'] = 3 # 'DOWN'

In [31]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(len(df[df.Target == 2]), len(df[df.Target == 3])))

Number of rows: 7779, Number of columns: 9
Number of UP rows: 424, Number of DOWN rows: 332


Create columns from Timestamp to Date, Year, Month and Day.

In [32]:
df['Date'] = df['Timestamp'].apply(utils.timestamptodate)
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

In [33]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 7779, Number of columns: 13


Transformation previous values from Open, High, Low, Close, Volume and Weighted columns.

In [34]:
# create PREV_DAYS * len(cols) new columns
cols = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price']
for i in cols:
    col = i
    for idx in range(settings.PREV_DAYS):
        prev = idx + 1
        df[col+'-'+str(prev)] = df.Close.shift(prev)
df = df.dropna()

In [35]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 7769, Number of columns: 83


### Split

In [36]:
train, test = utils.split_df2(df)

In [37]:
excl = ['Open', 'High', 'Low', 'Close','Volume_BTC', 'Volume_Currency', 'Weighted_Price', 'Target', 'Date', 'Year', 'Day', 'Timestamp']
cols = [c for c in df.columns if c not in excl]

### RandomForest

In [38]:
# train
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = rf.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.585755559915
Accuracy: 0.891919711786
Coeficiente Kappa: 0.0874792573343
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1738
         UP       0.29      0.05      0.08       122
       DOWN       0.22      0.02      0.04        83

avg / total       0.83      0.89      0.85      1943

Confusion Matrix:

Predicted     1   2  3  __all__
Actual                         
1          1725  11  2     1738
2           111   6  5      122
3            77   4  2       83
__all__    1913  21  9     1943


Overall Statistics:

Accuracy: 0.891919711786
95% CI: (0.87726101096290243, 0.90538311657198989)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0874792573343
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1943        1943        1943
P: Condition positive                        1738   

### Bagging

In [39]:
# train
ba = BaggingClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = ba.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.73713731679
Accuracy: 0.887802367473
Coeficiente Kappa: 0.0829448888793
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1738
         UP       0.20      0.04      0.07       122
       DOWN       0.14      0.02      0.04        83

avg / total       0.83      0.89      0.85      1943

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1718  14   6     1738
2           111   5   6      122
3            75   6   2       83
__all__    1904  25  14     1943


Overall Statistics:

Accuracy: 0.887802367473
95% CI: (0.87292258579787707, 0.90149986381066416)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0829448888793
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                  1943       1943        1943
P: Condition positive                       1738   

### ExtraTrees

In [40]:
# train
et = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = et.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.577331707282
Accuracy: 0.888831703551
Coeficiente Kappa: 0.110408329377
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1738
         UP       0.27      0.06      0.09       122
       DOWN       0.16      0.04      0.06        83

avg / total       0.83      0.89      0.85      1943

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1717  12   9     1738
2           108   7   7      122
3            73   7   3       83
__all__    1898  26  19     1943


Overall Statistics:

Accuracy: 0.888831703551
95% CI: (0.8740066219192163, 0.90247125058049571)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.110408329377
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                  1943       1943        1943
P: Condition positive                       1738     

### GradientBoosting

In [41]:
# train
gr = GradientBoostingClassifier(n_estimators=1000, max_depth=10, warm_start=True, learning_rate=0.08, random_state=17, verbose=0)
model1 = gr.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.825085769616
Accuracy: 0.88368502316
Coeficiente Kappa: 0.100050006558
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.98      0.94      1738
         UP       0.19      0.06      0.09       122
       DOWN       0.21      0.05      0.08        83

avg / total       0.83      0.88      0.85      1943

Confusion Matrix:

Predicted     1   2   3  __all__
Actual                          
1          1706  23   9     1738
2           109   7   6      122
3            73   6   4       83
__all__    1888  36  19     1943


Overall Statistics:

Accuracy: 0.88368502316
95% CI: (0.8685901227852425, 0.89761061385484486)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.100050006558
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                  1943       1943        1943
P: Condition positive                       1738       

### AdaBoost

In [42]:
# train
ab = AdaBoostClassifier(n_estimators=500, random_state=17)
model1 = ab.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 1.09387444573
Accuracy: 0.892949047864
Coeficiente Kappa: 0.00966700891227
Report:              precision    recall  f1-score   support

       KEEP       0.90      1.00      0.94      1738
         UP       0.25      0.01      0.02       122
       DOWN       0.00      0.00      0.00        83

avg / total       0.82      0.89      0.85      1943

Confusion Matrix:

Predicted     1  2  3  __all__
Actual                        
1          1734  3  1     1738
2           120  1  1      122
3            83  0  0       83
__all__    1937  4  2     1943


Overall Statistics:

Accuracy: 0.892949047864
95% CI: (0.87834658328135895, 0.90635295787432801)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.00966700891227
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1           2           3
Population                                  1943        1943        1943
P: Condition positive                       1738         12

### VotingClassifier

In [43]:
# train
v = VotingClassifier(estimators=[
        ('rf', rf), ('ba', ba), ('et', et), ('gr', gr), ('ab', ab)
    ], voting='hard', n_jobs=-1)
model1 = v.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
# y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred)

  **self._backend_args)


Accuracy: 0.890375707669
Coeficiente Kappa: 0.0676226415094
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1738
         UP       0.25      0.04      0.07       122
       DOWN       0.00      0.00      0.00        83

avg / total       0.82      0.89      0.85      1943

Confusion Matrix:

Predicted     1   2  3  __all__
Actual                         
1          1725  10  3     1738
2           112   5  5      122
3            78   5  0       83
__all__    1915  20  8     1943


Overall Statistics:

Accuracy: 0.890375707669
95% CI: (0.87563338444965144, 0.90392761817394152)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0676226415094
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1943        1943        1943
P: Condition positive                        1738         122          83
N: Condi

In [44]:
# train
v = VotingClassifier(estimators=[
        ('rf', rf), ('gr', gr)
    ], voting='hard', n_jobs=-1)
model1 = v.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
# y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred)

Accuracy: 0.891919711786
Coeficiente Kappa: 0.0635435385812
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.99      0.94      1738
         UP       0.24      0.03      0.06       122
       DOWN       0.17      0.01      0.02        83

avg / total       0.83      0.89      0.85      1943

Confusion Matrix:

Predicted     1   2  3  __all__
Actual                         
1          1728   9  1     1738
2           114   4  4      122
3            78   4  1       83
__all__    1920  17  6     1943


Overall Statistics:

Accuracy: 0.891919711786
95% CI: (0.87726101096290243, 0.90538311657198989)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.0635435385812
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         1           2           3
Population                                   1943        1943        1943
P: Condition positive                        1738         122          83
N: Condi

### Conclusions

In progress...