In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb

import settings
import utils
import get_data



### Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['coincheckJPY', 'krakenEUR', 'bitstampUSD', 'okcoinCNY', 'btcnCNY', 'krakenUSD', 'itbitUSD', 'bitbayPLN', 'btcoidIDR', 'localbtcRUB', 'localbtcGBP', 'btcdeEUR', 'coinfloorGBP', 'localbtcUSD']

In [2]:
# get_data.get('data/datas.csv', period='6-hour', market='bitstampUSD')

### Load Data

In [3]:
df = pd.read_csv('data/datas.csv', sep=',')

In [4]:
df.shape

(50756, 8)

### Preprocessing

Drop rows with read problems (0.00, 2.7e+308) values

In [5]:
df = utils.dropna(df)

In [6]:
df.shape

(45005, 8)

### Transformation

Create column target with class [UP, KEEP, DOWN]

In [7]:
df['Target'] = 0 # 'KEEP'
df.loc[df.Open + (df.Open * settings.PERCENT_UP) < df.Close, 'Target'] = 1 # 'UP'
df.loc[df.Open - (df.Open * settings.PERCENT_DOWN) > df.Close, 'Target'] = 2 # 'DOWN'

In [8]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(len(df[df.Target == 1]), len(df[df.Target == 2])))

Number of rows: 45005, Number of columns: 9
Number of UP rows: 1736, Number of DOWN rows: 1706


Create columns from Timestamp to Date, Year, Month and Day.

### Feature Engineering

In [9]:
df['Date'] = df['Timestamp'].apply(utils.timestamptodate)
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Week'] = df['Date'].dt.weekofyear
df['Weekday'] = df['Date'].dt.weekday

# extra dates
df["yearmonth"] = df["Date"].dt.year*100 + df["Date"].dt.month
df["yearweek"] = df["Date"].dt.year*100 + df["Date"].dt.weekofyear
df["yearweekday"] = df["Date"].dt.year*10 + df["Date"].dt.weekday

In [10]:
df['High-low'] = df['High'] - df['Low']
df['High-low_mean'] = (df['High'] - df['Low']) / 2.0
df['Close-open'] = df['Close'] - df['Open']
df['Close-open_mean'] = (df['Close'] - df['Open']) / 2.0

In [11]:
# Fundamental analysis

# daily return
df['Daily_return'] = (df['Close'] / df['Close'].shift(1)) - 1
df['Daily_return_100'] = ((df['Close'] / df['Close'].shift(1)) - 1) * 100

# cumulative return
df['Cumulative_return'] = (df['Close'] / df['Close'][0]) - 1
df['Cumulative_return_100'] = ((df['Close'] / df['Close'][0]) - 1) * 100

# TODO: cumulative return week, month, year...

In [12]:
# technical analysis (price and volume)

# momentum
df['Momentum_3'] = (df['Close'] / df['Close'].shift(3)) - 1
df['Momentum_3'] = df['Momentum_3'].shift(-1)

df['Momentum_5'] = (df['Close'] / df['Close'].shift(5)) - 1
df['Momentum_5'] = df['Momentum_5'].shift(-1)

df['Momentum_7'] = (df['Close'] / df['Close'].shift(7)) - 1
df['Momentum_7'] = df['Momentum_7'].shift(-1)

df['Momentum_10'] = (df['Close'] / df['Close'].shift(10)) - 1
df['Momentum_10'] = df['Momentum_10'].shift(-1)

# rollings: https://github.com/pandas-dev/pandas/blob/master/pandas/stats/moments.py
df['Rolling_mean_3'] = df.set_index('Date')['Close'].rolling(window=3).mean().values
df['Rolling_std_3'] = df.set_index('Date')['Close'].rolling(window=3).std().values
df['Rolling_cov_3'] = df.set_index('Date')['Close'].rolling(window=3).cov().values

df['Rolling_mean_3'] = df['Rolling_mean_3'].shift(-1)
df['Rolling_std_3'] = df['Rolling_std_3'].shift(-1)
df['Rolling_cov_3'] = df['Rolling_cov_3'].shift(-1)

df['Rolling_mean_5'] = df.set_index('Date')['Close'].rolling(window=5).mean().values
df['Rolling_std_5'] = df.set_index('Date')['Close'].rolling(window=5).std().values
df['Rolling_cov_5'] = df.set_index('Date')['Close'].rolling(window=5).cov().values

df['Rolling_mean_5'] = df['Rolling_mean_5'].shift(-1)
df['Rolling_std_5'] = df['Rolling_std_5'].shift(-1)
df['Rolling_cov_5'] = df['Rolling_cov_5'].shift(-1)

df['Rolling_mean_10'] = df.set_index('Date')['Close'].rolling(window=10).mean().values
df['Rolling_std_10'] = df.set_index('Date')['Close'].rolling(window=10).std().values
df['Rolling_cov_10'] = df.set_index('Date')['Close'].rolling(window=10).cov().values

df['Rolling_mean_10'] = df['Rolling_mean_10'].shift(-1)
df['Rolling_std_10'] = df['Rolling_std_10'].shift(-1)
df['Rolling_cov_10'] = df['Rolling_cov_10'].shift(-1)

# bollinger bands
df['Bollinger_band_mean_3_max'] = df['Rolling_mean_3'] + (2*df['Rolling_std_3'])
df['Bollinger_band_mean_3_min'] = df['Rolling_mean_3'] - (2*df['Rolling_std_3'])

df['Bollinger_band_mean_5_max'] = df['Rolling_mean_5'] + (2*df['Rolling_std_5'])
df['Bollinger_band_mean_5_min'] = df['Rolling_mean_5'] - (2*df['Rolling_std_5'])

df['Bollinger_band_mean_10_max'] = df['Rolling_mean_10'] + (2*df['Rolling_std_10'])
df['Bollinger_band_mean_10_min'] = df['Rolling_mean_10'] - (2*df['Rolling_std_10'])

In [13]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 45005, Number of columns: 45


Transformation previous values from Open, High, Low, Close, Volume and Weighted columns.

In [14]:
# create PREV_DAYS * len(cols) new columns
cols = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price', 'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean', 'Daily_return', 'Daily_return_100', 'Cumulative_return', 'Cumulative_return_100']
for col in cols:
    for idx in range(settings.PREV_DAYS):
        prev = idx + 1
        df[col+'-'+str(prev)] = df[col].shift(prev)
df = df.dropna()

In [15]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 44993, Number of columns: 195


### Split

In [16]:
train, test = utils.split_df2(df)

In [17]:
excl = ['Open', 'High', 'Low', 'Close','Volume_BTC', 'Volume_Currency', 'Weighted_Price', 'Target', 'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean', 'Daily_return', 'Daily_return_100', 'Cumulative_return', 'Cumulative_return_100', 'Date', 'Timestamp']
cols = [c for c in df.columns if c not in excl]

### RandomForest

In [18]:
# train
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = rf.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.175513167987
Accuracy: 0.929415948084
Coefficient Kappa: 0.308477927781
Report:              precision    recall  f1-score   support

       KEEP       0.94      0.99      0.96     10395
         UP       0.54      0.21      0.30       433
       DOWN       0.52      0.20      0.29       421

avg / total       0.91      0.93      0.91     11249

Confusion Matrix:

Predicted      0    1    2  __all__
Actual                             
0          10279   55   61    10395
1            325   90   18      433
2            313   22   86      421
__all__    10917  167  165    11249


Overall Statistics:

Accuracy: 0.929415948084
95% CI: (0.92452626966019869, 0.93408194276513079)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.308477927781
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        0           1          2
Population                                 11249       11249      11249
P: Condition positive             

### Bagging

In [19]:
# train
ba = BaggingClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = ba.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.172954521395
Accuracy: 0.932882922926
Coefficient Kappa: 0.411886112056
Report:              precision    recall  f1-score   support

       KEEP       0.95      0.98      0.97     10395
         UP       0.55      0.31      0.40       433
       DOWN       0.56      0.32      0.41       421

avg / total       0.92      0.93      0.92     11249

Confusion Matrix:

Predicted      0    1    2  __all__
Actual                             
0          10225   87   83    10395
1            277  135   21      433
2            262   25  134      421
__all__    10764  247  238    11249


Overall Statistics:

Accuracy: 0.932882922926
95% CI: (0.92810137187666164, 0.93743891950004699)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.411886112056
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        0          1           2
Population                                 11249      11249       11249
P: Condition positive             

### ExtraTrees

In [20]:
# train
et = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, random_state=17, verbose=0)
model1 = et.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.191833408217
Accuracy: 0.925593386079
Coefficient Kappa: 0.202061869824
Report:              precision    recall  f1-score   support

       KEEP       0.94      0.99      0.96     10395
         UP       0.48      0.12      0.19       433
       DOWN       0.42      0.12      0.18       421

avg / total       0.90      0.93      0.90     11249

Confusion Matrix:

Predicted      0    1    2  __all__
Actual                             
0          10313   33   49    10395
1            365   50   18      433
2            351   21   49      421
__all__    11029  104  116    11249


Overall Statistics:

Accuracy: 0.925593386079
95% CI: (0.92058830902213606, 0.93037683541563532)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.202061869824
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         0          1           2
Population                                  11249      11249       11249
P: Condition positive           

### GradientBoosting

In [21]:
# train
gr = GradientBoostingClassifier(n_estimators=1000, max_depth=10, warm_start=True, learning_rate=0.08, random_state=17, verbose=0)
model1 = gr.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 0.28010055284
Accuracy: 0.932882922926
Coefficient Kappa: 0.421087649685
Report:              precision    recall  f1-score   support

       KEEP       0.95      0.98      0.97     10395
         UP       0.58      0.31      0.41       433
       DOWN       0.53      0.34      0.42       421

avg / total       0.92      0.93      0.92     11249

Confusion Matrix:

Predicted      0    1    2  __all__
Actual                             
0          10213   79  103    10395
1            273  136   24      433
2            255   21  145      421
__all__    10741  236  272    11249


Overall Statistics:

Accuracy: 0.932882922926
95% CI: (0.92810137187666164, 0.93743891950004699)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.421087649685
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        0           1          2
Population                                 11249       11249      11249
P: Condition positive              

### AdaBoost

In [22]:
# train
ab = AdaBoostClassifier(n_estimators=500, random_state=17)
model1 = ab.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_pred_proba = model1.predict_proba(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred, y_pred_proba)

Cross Entropy: 1.08680562821
Accuracy: 0.917948262068
Coefficient Kappa: 0.407418414088
Report:              precision    recall  f1-score   support

       KEEP       0.96      0.96      0.96     10395
         UP       0.44      0.39      0.41       433
       DOWN       0.41      0.39      0.40       421

avg / total       0.92      0.92      0.92     11249

Confusion Matrix:

Predicted      0    1    2  __all__
Actual                             
0           9991  188  216    10395
1            240  169   24      433
2            224   31  166      421
__all__    10455  388  406    11249


Overall Statistics:

Accuracy: 0.917948262068
95% CI: (0.91272333584377663, 0.92295565507517552)
No Information Rate: ToDo
P-Value [Acc > NIR]: 0.999998419814
Kappa: 0.407418414088
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        0          1          2
Population                                 11249      11249      11249
P: Condition positive     

### linear models

In [25]:
from sklearn.linear_model import *

In [26]:
# train
rf = LogisticRegression(max_iter=2000)
model1 = rf.fit(train[cols], train['Target'])

# predict
y_pred = model1.predict(test[cols])
y_true = test['Target']

# metric
utils.metrics(y_true, y_pred)

Accuracy: 0.934394168371
Coefficient Kappa: 0.317203847389
Report:              precision    recall  f1-score   support

       KEEP       0.94      0.99      0.97     10395
         UP       0.75      0.22      0.34       433
       DOWN       0.68      0.21      0.32       421

avg / total       0.92      0.93      0.92     11249

Confusion Matrix:

Predicted      0    1    2  __all__
Actual                             
0          10328   30   37    10395
1            335   94    4      433
2            330    2   89      421
__all__    10993  126  130    11249


Overall Statistics:

Accuracy: 0.934394168371
95% CI: (0.92966084282523398, 0.938901122021359)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.0
Kappa: 0.317203847389
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                         0           1           2
Population                                  11249       11249       11249
P: Condition positive                       10395         433 