In [115]:
import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb

import settings
import utils
import get_data

### Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['coincheckJPY', 'krakenEUR', 'bitstampUSD', 'okcoinCNY', 'btcnCNY', 'krakenUSD', 'itbitUSD', 'bitbayPLN', 'btcoidIDR', 'localbtcRUB', 'localbtcGBP', 'btcdeEUR', 'coinfloorGBP', 'localbtcUSD']

In [116]:
# get_data.get('data/datas.csv', period='Hourly', market='bitstampUSD')

### Load Data

In [117]:
df = pd.read_csv('data/datas.csv', sep=',')

In [118]:
df.shape

(50689, 8)

### Preprocessing

In [119]:
df = utils.dropna(df)

In [120]:
df.shape

(44938, 8)

### Transformation

Create column target with class [UP, KEEP, DOWN]

In [121]:
df['Target'] = 0 # 'KEEP'
df.loc[df.Open + (df.Open * settings.PERCENT_UP) < df.Close, 'Target'] = 1 # 'UP'
df.loc[df.Open - (df.Open * settings.PERCENT_DOWN) > df.Close, 'Target'] = 2 # 'DOWN'

In [122]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(len(df[df.Target == 1]), len(df[df.Target == 2])))

Number of rows: 44938, Number of columns: 9
Number of UP rows: 1731, Number of DOWN rows: 1699


Create columns from Timestamp to Date, Year, Month and Day.

### Feature Engineering

In [123]:
df['Date'] = df['Timestamp'].apply(utils.timestamptodate)
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Week'] = df['Date'].dt.weekofyear
df['Weekday'] = df['Date'].dt.weekday

# extra dates
df["yearmonth"] = df["Date"].dt.year*100 + df["Date"].dt.month
df["yearweek"] = df["Date"].dt.year*100 + df["Date"].dt.weekofyear
df["yearweekday"] = df["Date"].dt.year*10 + df["Date"].dt.weekday

In [124]:
df['High-low'] = df['High'] - df['Low']
df['High-low_mean'] = (df['High'] - df['Low']) / 2.0
df['Close-open'] = df['Close'] - df['Open']
df['Close-open_mean'] = (df['Close'] - df['Open']) / 2.0

In [125]:
# Fundamental analysis
# daily return

df['Daily_return'] = (df['Close'] / df['Close'].shift(1)) - 1
df['Daily_return_100'] = ((df['Close'] / df['Close'].shift(1)) - 1) * 100

# cumulative return

df['Cumulative_return'] = (df['Close'] / df['Close'][0]) - 1
df['Cumulative_return_100'] = ((df['Close'] / df['Close'][0]) - 1) * 100

# TODO: cumulative return week, month, year...

In [126]:
# technical analysis (price and volume)

# momentum
df['Momentum_3'] = (df['Close'] / df['Close'].shift(3)) - 1
df['Momentum_3'] = df['Momentum_3'].shift(-1)

df['Momentum_5'] = (df['Close'] / df['Close'].shift(5)) - 1
df['Momentum_5'] = df['Momentum_5'].shift(-1)

df['Momentum_7'] = (df['Close'] / df['Close'].shift(7)) - 1
df['Momentum_7'] = df['Momentum_7'].shift(-1)

df['Momentum_10'] = (df['Close'] / df['Close'].shift(10)) - 1
df['Momentum_10'] = df['Momentum_10'].shift(-1)

# rollings: https://github.com/pandas-dev/pandas/blob/master/pandas/stats/moments.py
df['Rolling_mean_3'] = df.set_index('Date')['Close'].rolling(window=3).mean().values
df['Rolling_std_3'] = df.set_index('Date')['Close'].rolling(window=3).std().values
df['Rolling_cov_3'] = df.set_index('Date')['Close'].rolling(window=3).cov().values

df['Rolling_mean_3'] = df['Rolling_mean_3'].shift(-1)
df['Rolling_std_3'] = df['Rolling_std_3'].shift(-1)
df['Rolling_cov_3'] = df['Rolling_cov_3'].shift(-1)

df['Rolling_mean_5'] = df.set_index('Date')['Close'].rolling(window=5).mean().values
df['Rolling_std_5'] = df.set_index('Date')['Close'].rolling(window=5).std().values
df['Rolling_cov_5'] = df.set_index('Date')['Close'].rolling(window=5).cov().values

df['Rolling_mean_5'] = df['Rolling_mean_5'].shift(-1)
df['Rolling_std_5'] = df['Rolling_std_5'].shift(-1)
df['Rolling_cov_5'] = df['Rolling_cov_5'].shift(-1)

df['Rolling_mean_10'] = df.set_index('Date')['Close'].rolling(window=10).mean().values
df['Rolling_std_10'] = df.set_index('Date')['Close'].rolling(window=10).std().values
df['Rolling_cov_10'] = df.set_index('Date')['Close'].rolling(window=10).cov().values

df['Rolling_mean_10'] = df['Rolling_mean_10'].shift(-1)
df['Rolling_std_10'] = df['Rolling_std_10'].shift(-1)
df['Rolling_cov_10'] = df['Rolling_cov_10'].shift(-1)

# TODO: bollinger bands
"""
df['Bollinger_band_mean_10'] = (df['Close'] - df['Rolling_mean_10']) / 2 * df['Close'].std()
df['Bollinger_band_mean_10_'] = (df['Close'] + df['Rolling_mean_10']) / 2 * df['Close'].std()
"""

"\ndf['Bollinger_band_mean_10'] = (df['Close'] - df['Rolling_mean_10']) / 2 * df['Close'].std()\ndf['Bollinger_band_mean_10_'] = (df['Close'] + df['Rolling_mean_10']) / 2 * df['Close'].std()\n"

In [127]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 44938, Number of columns: 39


Transformation previous values from Open, High, Low, Close, Volume and Weighted columns.

In [128]:
# create PREV_DAYS * len(cols) new columns
cols = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price', 'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean', 'Daily_return', 'Daily_return_100', 'Cumulative_return', 'Cumulative_return_100']
for col in cols:
    for idx in range(settings.PREV_DAYS):
        prev = idx + 1
        df[col+'-'+str(prev)] = df[col].shift(prev)
df = df.dropna()

In [129]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 44926, Number of columns: 189


### Split

In [113]:
train, test = utils.split_df2(df)

In [114]:
excl = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price', 'Target', 'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean', 'Daily_return', 'Daily_return_100', 'Cumulative_return', 'Cumulative_return_100', 'Date', 'Timestamp']
cols = [c for c in df.columns if c not in excl]

# xgboost

In [50]:
y_train = train['Target']
y_mean = np.mean(y_train)
xgb_params = {
    'n_trees': 800,
    'eta': 0.0045,
    'max_depth': 20,
    'subsample': 0.85,
    'objective': 'multi:softmax',
    'num_class' : 3,
    'eval_metric': 'mlogloss', # 'merror', # 'rmse',
    'base_score': 0,
    'silent': 1
}

dtrain = xgb.DMatrix(train[cols], y_train)
dtest = xgb.DMatrix(test[cols])

cv_result = xgb.cv(xgb_params, dtrain)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=5000,
                   early_stopping_rounds=50,
                   verbose_eval=50,
                   show_stdv=False
                  )
num_boost_rounds = len(cv_result)

#num_boost_rounds = 756

print(num_boost_rounds)

# train
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

y_pred = model.predict(dtest)
y_true = test['Target']

# utils.metrics(y_true, y_pred)
utils.metrics2(y_true, y_pred)

[0]	train-mlogloss:1.09242	test-mlogloss:1.09288
[50]	train-mlogloss:0.835044	test-mlogloss:0.856424
[100]	train-mlogloss:0.651387	test-mlogloss:0.690087
[150]	train-mlogloss:0.514989	test-mlogloss:0.568514
[200]	train-mlogloss:0.411262	test-mlogloss:0.477475
[250]	train-mlogloss:0.331009	test-mlogloss:0.408167
[300]	train-mlogloss:0.268159	test-mlogloss:0.354857
[350]	train-mlogloss:0.218496	test-mlogloss:0.313571
[400]	train-mlogloss:0.179015	test-mlogloss:0.281525
[450]	train-mlogloss:0.147458	test-mlogloss:0.256527
[500]	train-mlogloss:0.122089	test-mlogloss:0.23711
[550]	train-mlogloss:0.101623	test-mlogloss:0.221949
[600]	train-mlogloss:0.0850543	test-mlogloss:0.21022
[650]	train-mlogloss:0.071603	test-mlogloss:0.20123
[700]	train-mlogloss:0.0606417	test-mlogloss:0.194355
[750]	train-mlogloss:0.051682	test-mlogloss:0.189175
[800]	train-mlogloss:0.044342	test-mlogloss:0.185327
[850]	train-mlogloss:0.0383037	test-mlogloss:0.182549
[900]	train-mlogloss:0.0333103	test-mlogloss:0.1806

In [51]:
import operator
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))
importance

[('Rolling_cov_5', 6),
 ('Rolling_cov_3', 357),
 ('Weighted_Price-3', 461),
 ('Close-6', 465),
 ('Close-7', 523),
 ('Weighted_Price-6', 530),
 ('Close-2', 541),
 ('Close-5', 545),
 ('Close-3', 564),
 ('Close-4', 570),
 ('Weighted_Price-4', 609),
 ('Close-8', 619),
 ('Close-10', 620),
 ('Weighted_Price-5', 625),
 ('Weighted_Price-9', 641),
 ('Weighted_Price-8', 654),
 ('Close-9', 656),
 ('Weighted_Price-2', 707),
 ('Weighted_Price-7', 831),
 ('Weighted_Price-10', 1004),
 ('High-4', 1128),
 ('High-3', 1129),
 ('Weighted_Price-1', 1185),
 ('Close-1', 1409),
 ('High-6', 1419),
 ('High-5', 1472),
 ('Low-9', 1517),
 ('High-9', 1524),
 ('Low-5', 1583),
 ('High-7', 1599),
 ('High-2', 1618),
 ('High-8', 1631),
 ('Low-8', 1775),
 ('Low-7', 1799),
 ('Low-4', 1817),
 ('Year', 1966),
 ('Low-6', 2157),
 ('Low-3', 2287),
 ('High-10', 2298),
 ('High-1', 2398),
 ('Open-6', 2498),
 ('Low-2', 2518),
 ('Rolling_mean_5', 2526),
 ('Open-3', 2587),
 ('Open-4', 2641),
 ('Low-10', 2657),
 ('Open-1', 2709),
 ('

In [52]:
importance = model.get_score(fmap='', importance_type='gain')
importance = sorted(importance.items(), key=operator.itemgetter(1))
importance

[('Rolling_cov_5', 0.5867553333333334),
 ('Year', 0.5914273452655645),
 ('Month', 0.631775167686934),
 ('Week', 0.6329468326143652),
 ('Day', 0.642162394747948),
 ('Weekday', 0.6448390450879607),
 ('Daily_return-9', 0.6744319589183089),
 ('Volume_BTC-4', 0.6829800973389932),
 ('Volume_BTC-5', 0.6836684525183547),
 ('Close-open-10', 0.6894098510282873),
 ('Volume_BTC-9', 0.6924557511432685),
 ('Volume_Currency-4', 0.6949476202143098),
 ('Close-open-9', 0.6963459396889697),
 ('Close-open-2', 0.6965284486092909),
 ('Daily_return-7', 0.7028543410259341),
 ('Daily_return-5', 0.7051262858144088),
 ('Volume_BTC-7', 0.7075545490537212),
 ('Volume_BTC-3', 0.7075619511844281),
 ('Volume_BTC-6', 0.7088344541181116),
 ('Close-open-5', 0.7134325630695919),
 ('Close-open-7', 0.7139578493138331),
 ('Volume_BTC-8', 0.7158536886382729),
 ('High-low-9', 0.7159752615105163),
 ('Close-open-8', 0.7169354794253053),
 ('Close-open-6', 0.7170496304103681),
 ('Close-open-3', 0.7231416501994189),
 ('Volume_Curr