In [177]:
import operator

import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb

import get_data
import settings
import utils

### Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['coincheckJPY', 'krakenEUR', 'bitstampUSD', 'okcoinCNY', 'btcnCNY', 'krakenUSD', 'itbitUSD', 'bitbayPLN', 'btcoidIDR', 'localbtcRUB', 'localbtcGBP', 'btcdeEUR', 'coinfloorGBP', 'localbtcUSD']

In [178]:
# One-off download of the raw data into data/datas.csv; left commented out so the
# notebook keeps reading the CSV already on disk.
# NOTE(review): presumably fetches from the bitcoincharts API — confirm in get_data.get.
# get_data.get('data/datas.csv', period='Hourly', market='bitstampUSD')

### Load Data

In [179]:
# Read the raw dataset; the file is comma-separated, which is read_csv's default.
df = pd.read_csv('data/datas.csv')

In [180]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(50756, 8)

### Preprocessing

In [181]:
# Clean the frame with the project helper.
# NOTE(review): presumably drops rows containing missing values — confirm in
# utils.dropna. The recorded outputs show the row count falling from 50756 to 45005.
df = utils.dropna(df)

In [182]:
# (rows, columns) of the frame after the cleaning step.
df.shape

(45005, 8)

### Transformation

Create the `Target` column with three classes: KEEP (0), UP (1) and DOWN (2).

In [183]:
# Label each row by how far Close moved away from Open:
#   1 (UP)   -> Close is above Open by more than settings.PERCENT_UP of Open
#   2 (DOWN) -> Close is below Open by more than settings.PERCENT_DOWN of Open
#   0 (KEEP) -> everything else
open_price = df['Open']
close_price = df['Close']
rose = open_price + (open_price * settings.PERCENT_UP) < close_price
fell = open_price - (open_price * settings.PERCENT_DOWN) > close_price

df['Target'] = 0  # 'KEEP'
df.loc[rose, 'Target'] = 1  # 'UP'
df.loc[fell, 'Target'] = 2  # 'DOWN' (assigned last, so it wins if both masks match)

In [184]:
# Report the frame size and how many rows landed in the UP (1) and DOWN (2) classes.
n_rows, n_cols = df.shape
n_up = int((df['Target'] == 1).sum())
n_down = int((df['Target'] == 2).sum())
print('Number of rows: {}, Number of columns: {}'.format(n_rows, n_cols))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(n_up, n_down))

Number of rows: 45005, Number of columns: 9
Number of UP rows: 1736, Number of DOWN rows: 1706


Derive the Date, Year, Month, Day, Week and Weekday columns (plus combined year/month, year/week and year/weekday keys) from Timestamp.

### Feature Engineering

In [185]:
# Calendar features derived from the raw Timestamp column.
df['Date'] = pd.to_datetime(df['Timestamp'].apply(utils.timestamptodate))
date_parts = df['Date'].dt

# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# Prefer the ISO calendar accessor when it exists and fall back for old pandas.
# The cast to int64 keeps a plain integer column instead of nullable UInt32.
if hasattr(date_parts, 'isocalendar'):
    week_of_year = date_parts.isocalendar().week.astype('int64')
else:
    week_of_year = date_parts.weekofyear

df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Day'] = date_parts.day
df['Week'] = week_of_year
df['Weekday'] = date_parts.weekday

# extra dates: combined keys (e.g. 201705 for May 2017).
# NOTE(review): 'yearweek' pairs the calendar year with the ISO week, so dates
# around New Year can map to week 1/52/53 of the "wrong" year — kept as in the
# original feature definition.
df["yearmonth"] = df['Year'] * 100 + df['Month']
df["yearweek"] = df['Year'] * 100 + df['Week']
df["yearweekday"] = df['Year'] * 10 + df['Weekday']

In [186]:
# Intra-period price spreads and their halves.
high_low_spread = df['High'] - df['Low']
close_open_spread = df['Close'] - df['Open']

df['High-low'] = high_low_spread
df['High-low_mean'] = high_low_spread / 2.0
df['Close-open'] = close_open_spread
df['Close-open_mean'] = close_open_spread / 2.0

In [187]:
# Fundamental analysis

# daily return: relative change of Close versus the previous row.
# (The column names say "daily"; the step is whatever period the rows represent.)
row_return = (df['Close'] / df['Close'].shift(1)) - 1
df['Daily_return'] = row_return
df['Daily_return_100'] = row_return * 100

# cumulative return: relative change of Close versus the first row of the frame.
# Use positional access: after rows have been dropped the index label 0 may no
# longer exist, in which case `df['Close'][0]` raises a KeyError.
first_close = df['Close'].iloc[0]
cumulative_return = (df['Close'] / first_close) - 1
df['Cumulative_return'] = cumulative_return
df['Cumulative_return_100'] = cumulative_return * 100

# TODO: cumulative return week, month, year...

In [188]:
# technical analysis
#
# Features computed from the Close series over look-back windows of 2..10 rows.
# The three loops are kept separate so the column order (which becomes the
# model's feature order) is unchanged.
#
# NOTE(review): every feature here is shifted by -1, i.e. row i receives the
# value computed at row i+1. If rows are in ascending time order this places
# future information next to row i's Target — confirm the row ordering and
# that this is intended.
close = df['Close']
windows = range(2, 11)

# momentum: relative change of Close over m rows
for m in windows:
    df['Momentum_' + str(m)] = ((close / close.shift(m)) - 1).shift(-1)

# rollings: integer windows are count-based, so no Date index is needed and the
# Close series is looked up once instead of rebuilding a re-indexed frame per column
for m in windows:
    rolling_close = close.rolling(window=m)
    df['Rolling_mean_' + str(m)] = rolling_close.mean().shift(-1).values
    df['Rolling_std_' + str(m)] = rolling_close.std().shift(-1).values
    # cov() with no second series is the covariance of Close with itself
    df['Rolling_cov_' + str(m)] = rolling_close.cov().shift(-1).values

# bollinger bands: rolling mean +/- two rolling standard deviations
for m in windows:
    band_center = df['Rolling_mean_' + str(m)]
    band_width = 2 * df['Rolling_std_' + str(m)]
    df['Bollinger_band_mean_' + str(m) + '_max'] = band_center + band_width
    df['Bollinger_band_mean_' + str(m) + '_min'] = band_center - band_width

In [189]:
# Sanity check: frame size after adding the engineered columns.
n_rows, n_cols = df.shape
print('Number of rows: {}, Number of columns: {}'.format(n_rows, n_cols))

Number of rows: 45005, Number of columns: 80


In [190]:
# shift
# Move the raw and return columns up by one row (row i receives row i+1's
# values), then drop every row left with a missing value by this or the
# earlier shift/rolling steps.
# NOTE(review): 'Target' is not shifted, so after this cell it sits next to
# values taken from the following row — confirm this alignment is intended.
# NOTE: this cell is not idempotent; re-running it shifts the columns again.
shifted_cols = [
    'Open', 'High', 'Low', 'Close',
    'Volume_BTC', 'Volume_Currency', 'Weighted_Price',
    'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean',
    'Daily_return', 'Daily_return_100',
    'Cumulative_return', 'Cumulative_return_100',
]
df[shifted_cols] = df[shifted_cols].shift(-1)
df = df.dropna()

In [191]:
# Sanity check: frame size after the shift and the removal of incomplete rows.
n_rows, n_cols = df.shape
print('Number of rows: {}, Number of columns: {}'.format(n_rows, n_cols))

Number of rows: 44995, Number of columns: 80


### Split

In [193]:
# Partition the feature frame into training and evaluation sets with the project helper.
# NOTE(review): the split strategy (chronological vs. random, proportions) lives in
# utils.split_df2 and is not visible here — for time-series data confirm it does
# not shuffle future rows into the training set.
train, test = utils.split_df2(df)

In [194]:
# Model inputs: every column except the label and the raw time columns,
# kept in the frame's original column order.
excl = ['Target', 'Date', 'Timestamp']
cols = df.columns[~df.columns.isin(excl)].tolist()

# xgboost

In [195]:
# Labels for the training partition (0 = KEEP, 1 = UP, 2 = DOWN).
y_train = train['Target']
# NOTE(review): not used in this cell (the mean of class ids has no meaning for
# a multi-class objective); kept only in case a later cell reads it.
y_mean = np.mean(y_train)

# Booster hyper-parameters.
# The former 'n_trees': 800 entry was removed: it is not an XGBoost parameter,
# so it had no effect, and it contradicted the number of rounds actually used
# below. The number of trees is controlled by `num_boost_round`.
xgb_params = {
    'eta': 0.0045,
    'max_depth': 20,
    'subsample': 0.85,
    'objective': 'multi:softmax',  # predict() returns class ids directly
    'num_class': 3,
    'eval_metric': 'mlogloss',  # alternatives tried: 'merror', 'rmse'
    'base_score': 0,
    # NOTE(review): 'silent' is deprecated in favour of 'verbosity' in newer
    # XGBoost releases — switch once the minimum supported version is known.
    'silent': 1,
}

dtrain = xgb.DMatrix(train[cols], y_train)
dtest = xgb.DMatrix(test[cols])

# The number of boosting rounds is fixed by hand. It could instead be chosen by
# cross-validation, e.g.:
#   cv_result = xgb.cv(xgb_params, dtrain, num_boost_round=<upper bound>,
#                      early_stopping_rounds=50, verbose_eval=50, show_stdv=False)
#   num_boost_rounds = len(cv_result)
num_boost_rounds = 1300

print(num_boost_rounds)

# train
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

# predict
y_pred = model.predict(dtest)
y_true = test['Target']

# Project helper; per the recorded output it prints accuracy, Cohen's kappa and
# the confusion matrix.
utils.metrics2(y_true, y_pred)

1300
Accuracy: 0.973330962752
Coefficient Kappa: 0.802703881133
Confussion Matrix:
[[10299    41    41]
 [   99   316     2]
 [  113     4   334]]


In [196]:
# Feature importance as split counts (how many times each feature is used to
# split), sorted from least to most used.
# `operator` is imported in the notebook's top import cell rather than here.
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))
importance

[('Rolling_cov_5', 55),
 ('Rolling_cov_6', 427),
 ('Rolling_cov_3', 914),
 ('Rolling_mean_3', 947),
 ('Rolling_mean_8', 1031),
 ('Rolling_mean_7', 1085),
 ('Weighted_Price', 1115),
 ('Rolling_mean_9', 1141),
 ('Rolling_mean_2', 1156),
 ('Rolling_mean_6', 1168),
 ('Rolling_mean_5', 1188),
 ('Rolling_mean_4', 1190),
 ('Rolling_cov_4', 1290),
 ('Year', 1301),
 ('Bollinger_band_mean_3_max', 1382),
 ('Rolling_mean_10', 1399),
 ('Bollinger_band_mean_8_max', 1494),
 ('Bollinger_band_mean_6_max', 1501),
 ('Bollinger_band_mean_5_max', 1515),
 ('Bollinger_band_mean_7_max', 1557),
 ('Close', 1619),
 ('Bollinger_band_mean_4_max', 1840),
 ('Bollinger_band_mean_9_max', 1900),
 ('Bollinger_band_mean_9_min', 1909),
 ('Bollinger_band_mean_7_min', 2008),
 ('Bollinger_band_mean_8_min', 2016),
 ('Bollinger_band_mean_10_max', 2301),
 ('Bollinger_band_mean_6_min', 2425),
 ('Bollinger_band_mean_5_min', 2486),
 ('Bollinger_band_mean_3_min', 2938),
 ('Bollinger_band_mean_4_min', 2979),
 ('High', 3112),
 ('Boll

In [197]:
# Feature importance by gain, sorted from lowest to highest.
gain_by_feature = model.get_score(importance_type='gain')
importance = sorted(gain_by_feature.items(), key=lambda item: item[1])
importance

[('Rolling_cov_4', 0.39266235447906944),
 ('Rolling_cov_6', 0.4513896416393439),
 ('Weekday', 0.453303677587354),
 ('Day', 0.46133882101974377),
 ('Week', 0.4803043029314136),
 ('Momentum_10', 0.5191047701491791),
 ('Momentum_7', 0.557125616952049),
 ('Momentum_8', 0.5609530360168099),
 ('Momentum_9', 0.5619482308869986),
 ('Month', 0.5679134873774199),
 ('Rolling_cov_5', 0.5937280381818182),
 ('Momentum_5', 0.60244104824803),
 ('Momentum_4', 0.6046979886981056),
 ('Momentum_6', 0.6245873493559033),
 ('Volume_BTC', 0.6412417413736236),
 ('Close', 0.6439677527010506),
 ('Volume_Currency', 0.6717289335925942),
 ('Bollinger_band_mean_10_max', 0.6782972680834426),
 ('Bollinger_band_mean_3_max', 0.6901914091780026),
 ('Bollinger_band_mean_6_max', 0.701491961056627),
 ('Momentum_3', 0.7248471858948418),
 ('Rolling_std_10', 0.7366679822245462),
 ('Rolling_cov_2', 0.739530539351732),
 ('yearweekday', 0.7612386077945933),
 ('Open', 0.7632939947175461),
 ('Rolling_std_9', 0.767130160289881),
 ('