In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import *

import settings
import utils

### Get and Load Data

In [2]:
# DOWNLOAD data and save in data folder (data/BCHARTS-BITSTAMPUSD.csv) https://www.quandl.com/api/v3/datasets/BCHARTS/BITSTAMPUSD/data.csv?order=asc
df = pd.read_csv(settings.PATH_DATA, sep=',')

In [3]:
df.rename(columns={'Volume (BTC)': 'Volume_BTC', 'Volume (Currency)': 'Volume_Currency', 'Weighted Price': 'Weighted_Price'}, inplace=True)

In [4]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume_BTC,Volume_Currency,Weighted_Price
0,2011-09-13,5.8,6.0,5.65,5.97,58.371382,346.097389,5.929231
1,2011-09-14,5.58,5.72,5.52,5.53,61.145984,341.854813,5.590798
2,2011-09-15,5.12,5.24,5.0,5.13,80.140795,408.259002,5.094272
3,2011-09-16,4.82,4.87,4.8,4.85,39.914007,193.763147,4.854515
4,2011-09-17,4.87,4.87,4.87,4.87,0.3,1.461,4.87


### Preprocessing

Drop rows with 0.00 values

In [5]:
df = utils.dropna(df)

### Transformation

Create column target with class [UP, KEEP, DOWN]

In [6]:
df['Target'] = 1 # 'KEEP'
df.loc[df.Open + (df.Open * settings.PERCENT_UP) < df.Close, 'Target'] = 2 # 'UP'
df.loc[df.Open - (df.Open * settings.PERCENT_DOWN) > df.Close, 'Target'] = 3 # 'DOWN'

In [7]:
df.head(20)

Unnamed: 0,Date,Open,High,Low,Close,Volume_BTC,Volume_Currency,Weighted_Price,Target
0,2011-09-13,5.8,6.0,5.65,5.97,58.371382,346.097389,5.929231,1
1,2011-09-14,5.58,5.72,5.52,5.53,61.145984,341.854813,5.590798,1
2,2011-09-15,5.12,5.24,5.0,5.13,80.140795,408.259002,5.094272,1
3,2011-09-16,4.82,4.87,4.8,4.85,39.914007,193.763147,4.854515,1
4,2011-09-17,4.87,4.87,4.87,4.87,0.3,1.461,4.87,1
5,2011-09-18,4.87,4.92,4.81,4.92,119.8128,579.843103,4.839576,1
6,2011-09-19,4.9,4.9,4.9,4.9,20.0,98.0,4.9,1
7,2011-09-20,4.92,5.66,4.92,5.66,89.280711,481.049263,5.388054,2
8,2011-09-21,5.7,5.79,5.66,5.66,17.629322,100.594234,5.706075,1
9,2011-09-22,5.68,5.72,5.68,5.72,43.778422,249.403941,5.69696,1


In [8]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(len(df[df.Target == 2]), len(df[df.Target == 3])))

Number of rows: 2018, Number of columns: 9
Number of UP rows: 183, Number of DOWN rows: 126


Create columns from Date to Day, Month and Year.

In [9]:
df['Date'] = pd.to_datetime(df['Date'])

df['Year'] = 0
df['Year'] = df['Date'].dt.year

df['Month'] = 0
df['Month'] = df['Date'].dt.month

df['Day'] = 0
df['Day'] = df['Date'].dt.day

In [10]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 2018, Number of columns: 12


Transformation previous values from Open, High, Low and Close columns.

In [11]:
# create PREV_DAYS * len(cols) new columns
cols = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price']
for i in cols:
    col = i
    for idx in range(settings.PREV_DAYS):
        prev = idx + 1
        df[col+'-'+str(prev)] = df.Close.shift(prev)
df = df.dropna()

In [12]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 2008, Number of columns: 82


In [13]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume_BTC,Volume_Currency,Weighted_Price,Target,Year,...,Weighted_Price-1,Weighted_Price-2,Weighted_Price-3,Weighted_Price-4,Weighted_Price-5,Weighted_Price-6,Weighted_Price-7,Weighted_Price-8,Weighted_Price-9,Weighted_Price-10
10,2011-09-23,5.7,5.72,5.7,5.72,20.680037,118.023015,5.707099,1,2011,...,5.72,5.66,5.66,4.9,4.92,4.87,4.85,5.13,5.53,5.97
11,2011-09-24,5.7,5.76,5.68,5.68,90.587066,518.383945,5.722494,1,2011,...,5.72,5.72,5.66,5.66,4.9,4.92,4.87,4.85,5.13,5.53
12,2011-09-25,6.05,6.05,6.05,6.05,12.22,73.931,6.05,1,2011,...,5.68,5.72,5.72,5.66,5.66,4.9,4.92,4.87,4.85,5.13
13,2011-09-26,6.06,6.06,4.8,4.8,39.578463,236.800854,5.983073,3,2011,...,6.05,5.68,5.72,5.72,5.66,5.66,4.9,4.92,4.87,4.85
14,2011-09-27,4.85,4.92,4.85,4.92,24.3545,119.22506,4.895402,1,2011,...,4.8,6.05,5.68,5.72,5.72,5.66,5.66,4.9,4.92,4.87


### Split

In [14]:
train, test = utils.split_df2(df)

In [15]:
excl = ['Open', 'High', 'Low', 'Close', 'Target', 'Date', 'Year', 'Day']
cols = [c for c in df.columns if c not in excl]

### RandomForest

In [16]:
# train
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = rf.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.483762382449
Accuracy: 0.846613545817
Coeficiente Kappa: 0.260465294253
Report:              precision    recall  f1-score   support

       KEEP       0.89      0.95      0.92       430
         UP       0.36      0.26      0.30        38
       DOWN       0.46      0.18      0.26        34

avg / total       0.82      0.85      0.83       502

Confusion Matrix:

Predicted    1   2   3  __all__
Actual                         
1          409  15   6      430
2           27  10   1       38
3           25   3   6       34
__all__    461  28  13      502


Overall Statistics:

Accuracy: 0.846613545817
95% CI: (0.8120637086763679, 0.87701419388967428)
No Information Rate: ToDo
P-Value [Acc > NIR]: 0.999999961304
Kappa: 0.260465294253
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2          3
Population                                   502        502        502
P: Condition positive                        430  

### Bagging

In [17]:
# train
ba = BaggingClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = ba.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.642261428761
Accuracy: 0.854581673307
Coeficiente Kappa: 0.343732091691
Report:              precision    recall  f1-score   support

       KEEP       0.90      0.95      0.93       430
         UP       0.35      0.24      0.28        38
       DOWN       0.46      0.32      0.38        34

avg / total       0.83      0.85      0.84       502

Confusion Matrix:

Predicted    1   2   3  __all__
Actual                         
1          409  12   9      430
2           25   9   4       38
3           18   5  11       34
__all__    452  26  24      502


Overall Statistics:

Accuracy: 0.854581673307
95% CI: (0.82066366783807776, 0.88425146904760732)
No Information Rate: ToDo
P-Value [Acc > NIR]: 0.999543917913
Kappa: 0.343732091691
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2          3
Population                                   502        502        502
P: Condition positive                        430 

### ExtraTrees

In [18]:
# train
et = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = et.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.49575847427
Accuracy: 0.840637450199
Coeficiente Kappa: 0.168874172185
Report:              precision    recall  f1-score   support

       KEEP       0.88      0.96      0.92       430
         UP       0.28      0.18      0.22        38
       DOWN       0.33      0.06      0.10        34

avg / total       0.79      0.84      0.81       502

Confusion Matrix:

Predicted    1   2  3  __all__
Actual                        
1          413  14  3      430
2           30   7  1       38
3           28   4  2       34
__all__    471  25  6      502


Overall Statistics:

Accuracy: 0.840637450199
95% CI: (0.80563300951307526, 0.87156666965292273)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.168874172185
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2           3
Population                                   502        502         502
P: Condition positive                        430         38      

### GradientBoosting

In [19]:
# train
gr = GradientBoostingClassifier(n_estimators=1000, max_depth=10, warm_start=True, learning_rate=0.08, random_state=17, verbose=0)
model1 = gr.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.80996788522
Accuracy: 0.852589641434
Coeficiente Kappa: 0.365598743084
Report:              precision    recall  f1-score   support

       KEEP       0.91      0.94      0.93       430
         UP       0.35      0.37      0.36        38
       DOWN       0.47      0.24      0.31        34

avg / total       0.84      0.85      0.84       502

Confusion Matrix:

Predicted    1   2   3  __all__
Actual                         
1          406  17   7      430
2           22  14   2       38
3           17   9   8       34
__all__    445  40  17      502


Overall Statistics:

Accuracy: 0.852589641434
95% CI: (0.81851083603014907, 0.88244503760679804)
No Information Rate: ToDo
P-Value [Acc > NIR]: 0.991385188156
Kappa: 0.365598743084
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2          3
Population                                   502        502        502
P: Condition positive                        430  

### AdaBoost

In [20]:
# train
ab = AdaBoostClassifier(n_estimators=500, random_state=17)
model1 = ab.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 1.08663940731
Accuracy: 0.832669322709
Coeficiente Kappa: 0.2335320634
Report:              precision    recall  f1-score   support

       KEEP       0.89      0.94      0.91       430
         UP       0.24      0.21      0.22        38
       DOWN       0.50      0.21      0.29        34

avg / total       0.81      0.83      0.82       502

Confusion Matrix:

Predicted    1   2   3  __all__
Actual                         
1          403  22   5      430
2           28   8   2       38
3           23   4   7       34
__all__    454  34  14      502


Overall Statistics:

Accuracy: 0.832669322709
95% CI: (0.79708302781739193, 0.86427866639521944)
No Information Rate: ToDo
P-Value [Acc > NIR]: 0.999999808346
Kappa: 0.2335320634
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        1          2          3
Population                                   502        502        502
P: Condition positive                        430      

### VotingClassifier

In [None]:
# train
v = VotingClassifier(estimators=[
        ('rf', rf), ('ba', ba), ('et', et), ('gr', gr), ('ab', ab)
    ], voting='hard', n_jobs=-1)
model1 = v.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
# y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred)

  **self._backend_args)


Accuracy: 0.846613545817
Coeficiente Kappa: 0.243383964923
Report:              precision    recall  f1-score   support

       KEEP       0.89      0.96      0.92       430
         UP       0.35      0.24      0.28        38
       DOWN       0.42      0.15      0.22        34

avg / total       0.81      0.85      0.82       502

Confusion Matrix:

Predicted    1   2   3  __all__
Actual                         
1          411  13   6      430
2           28   9   1       38
3           25   4   5       34
__all__    464  26  12      502


Overall Statistics:

Accuracy: 0.846613545817
95% CI: (0.8120637086763679, 0.87701419388967428)
No Information Rate: ToDo
P-Value [Acc > NIR]: 0.99999999849
Kappa: 0.243383964923
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                       1          2          3
Population                                  502        502        502
P: Condition positive                       430         38         34
N: Condition 

In [None]:
# train
v = VotingClassifier(estimators=[
        ('rf', rf), ('ba', ba), ('et', et), ('gr', gr), ('ab', ab)
    ], voting='hard', n_jobs=-1)
model1 = v.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
# y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred)

  **self._backend_args)


### Conclusions

RandomForest and GradientBoosting are working very good. We need study them a few more...