In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb

import settings
import utils
import get_data



### Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['coincheckJPY', 'krakenEUR', 'bitstampUSD', 'okcoinCNY', 'btcnCNY', 'krakenUSD', 'itbitUSD', 'bitbayPLN', 'btcoidIDR', 'localbtcRUB', 'localbtcGBP', 'btcdeEUR', 'coinfloorGBP', 'localbtcUSD']

In [2]:
# get_data.get('data/datas.csv', period='Hourly', market='bitstampUSD')

### Load Data

In [3]:
df = pd.read_csv('data/datas.csv', sep=',')

In [4]:
df.shape

(50756, 8)

### Preprocessing

In [5]:
df = utils.dropna(df)

In [6]:
df.shape

(45005, 8)

### Transformation

Create column target with class [UP, KEEP, DOWN]

In [7]:
df['Target'] = 0 # 'KEEP'
df.loc[df.Open + (df.Open * settings.PERCENT_UP) < df.Close, 'Target'] = 1 # 'UP'
df.loc[df.Open - (df.Open * settings.PERCENT_DOWN) > df.Close, 'Target'] = 2 # 'DOWN'

In [8]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(len(df[df.Target == 1]), len(df[df.Target == 2])))

Number of rows: 45005, Number of columns: 9
Number of UP rows: 1736, Number of DOWN rows: 1706


Create columns from Timestamp to Date, Year, Month and Day.

### Feature Engineering

In [9]:
df['Date'] = df['Timestamp'].apply(utils.timestamptodate)
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Week'] = df['Date'].dt.weekofyear
df['Weekday'] = df['Date'].dt.weekday

# extra dates
df["yearmonth"] = df["Date"].dt.year*100 + df["Date"].dt.month
df["yearweek"] = df["Date"].dt.year*100 + df["Date"].dt.weekofyear
df["yearweekday"] = df["Date"].dt.year*10 + df["Date"].dt.weekday

In [10]:
df['High-low'] = df['High'] - df['Low']
df['High-low_mean'] = (df['High'] - df['Low']) / 2.0
df['Close-open'] = df['Close'] - df['Open']
df['Close-open_mean'] = (df['Close'] - df['Open']) / 2.0

In [11]:
# Fundamental analysis

# daily return
df['Daily_return'] = (df['Close'] / df['Close'].shift(1)) - 1
df['Daily_return_100'] = ((df['Close'] / df['Close'].shift(1)) - 1) * 100

# cumulative return
df['Cumulative_return'] = (df['Close'] / df['Close'][0]) - 1
df['Cumulative_return_100'] = ((df['Close'] / df['Close'][0]) - 1) * 100

# TODO: cumulative return week, month, year...

In [12]:
# technical analysis (price and volume)

# momentum
df['Momentum_3'] = (df['Close'] / df['Close'].shift(3)) - 1
df['Momentum_3'] = df['Momentum_3'].shift(-1)

df['Momentum_5'] = (df['Close'] / df['Close'].shift(5)) - 1
df['Momentum_5'] = df['Momentum_5'].shift(-1)

df['Momentum_7'] = (df['Close'] / df['Close'].shift(7)) - 1
df['Momentum_7'] = df['Momentum_7'].shift(-1)

df['Momentum_10'] = (df['Close'] / df['Close'].shift(10)) - 1
df['Momentum_10'] = df['Momentum_10'].shift(-1)

# rollings: https://github.com/pandas-dev/pandas/blob/master/pandas/stats/moments.py
df['Rolling_mean_3'] = df.set_index('Date')['Close'].rolling(window=3).mean().values
df['Rolling_std_3'] = df.set_index('Date')['Close'].rolling(window=3).std().values
df['Rolling_cov_3'] = df.set_index('Date')['Close'].rolling(window=3).cov().values

df['Rolling_mean_3'] = df['Rolling_mean_3'].shift(-1)
df['Rolling_std_3'] = df['Rolling_std_3'].shift(-1)
df['Rolling_cov_3'] = df['Rolling_cov_3'].shift(-1)

df['Rolling_mean_5'] = df.set_index('Date')['Close'].rolling(window=5).mean().values
df['Rolling_std_5'] = df.set_index('Date')['Close'].rolling(window=5).std().values
df['Rolling_cov_5'] = df.set_index('Date')['Close'].rolling(window=5).cov().values

df['Rolling_mean_5'] = df['Rolling_mean_5'].shift(-1)
df['Rolling_std_5'] = df['Rolling_std_5'].shift(-1)
df['Rolling_cov_5'] = df['Rolling_cov_5'].shift(-1)

df['Rolling_mean_10'] = df.set_index('Date')['Close'].rolling(window=10).mean().values
df['Rolling_std_10'] = df.set_index('Date')['Close'].rolling(window=10).std().values
df['Rolling_cov_10'] = df.set_index('Date')['Close'].rolling(window=10).cov().values

df['Rolling_mean_10'] = df['Rolling_mean_10'].shift(-1)
df['Rolling_std_10'] = df['Rolling_std_10'].shift(-1)
df['Rolling_cov_10'] = df['Rolling_cov_10'].shift(-1)

# bollinger bands

df['Bollinger_band_mean_3_max'] = df['Rolling_mean_3'] + (2*df['Rolling_std_3'])
df['Bollinger_band_mean_3_min'] = df['Rolling_mean_3'] - (2*df['Rolling_std_3'])

df['Bollinger_band_mean_5_max'] = df['Rolling_mean_5'] + (2*df['Rolling_std_5'])
df['Bollinger_band_mean_5_min'] = df['Rolling_mean_5'] - (2*df['Rolling_std_5'])

df['Bollinger_band_mean_10_max'] = df['Rolling_mean_10'] + (2*df['Rolling_std_10'])
df['Bollinger_band_mean_10_min'] = df['Rolling_mean_10'] - (2*df['Rolling_std_10'])


In [13]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 45005, Number of columns: 45


In [14]:
# shift
cols = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price', 'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean', 'Daily_return', 'Daily_return_100', 'Cumulative_return', 'Cumulative_return_100']
for col in cols:
    df[col] = df[col].shift(-1)
df = df.dropna()

In [15]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))

Number of rows: 44995, Number of columns: 45


### Split

In [16]:
train, test = utils.split_df2(df)

In [17]:
excl = ['Target', 'Date', 'Timestamp']
cols = [c for c in df.columns if c not in excl]

# xgboost

In [18]:
y_train = train['Target']
y_mean = np.mean(y_train)
xgb_params = {
    'n_trees': 800,
    'eta': 0.0045,
    'max_depth': 20,
    'subsample': 0.85,
    'objective': 'multi:softmax',
    'num_class' : 3,
    'eval_metric': 'mlogloss', # 'merror', # 'rmse',
    'base_score': 0,
    'silent': 1
}

dtrain = xgb.DMatrix(train[cols], y_train)
dtest = xgb.DMatrix(test[cols])

cv_result = xgb.cv(xgb_params, dtrain)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=5000,
                   early_stopping_rounds=50,
                   verbose_eval=50,
                   show_stdv=False
                  )
num_boost_rounds = len(cv_result)

#num_boost_rounds = 756

print(num_boost_rounds)

# train
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

y_pred = model.predict(dtest)
y_true = test['Target']

# utils.metrics(y_true, y_pred)
utils.metrics2(y_true, y_pred)

[0]	train-mlogloss:1.09239	test-mlogloss:1.09272
[50]	train-mlogloss:0.834179	test-mlogloss:0.849465
[100]	train-mlogloss:0.650097	test-mlogloss:0.677785
[150]	train-mlogloss:0.513633	test-mlogloss:0.551788
[200]	train-mlogloss:0.409891	test-mlogloss:0.457024
[250]	train-mlogloss:0.329746	test-mlogloss:0.38446
[300]	train-mlogloss:0.267055	test-mlogloss:0.3284
[350]	train-mlogloss:0.217569	test-mlogloss:0.284712
[400]	train-mlogloss:0.178243	test-mlogloss:0.250568
[450]	train-mlogloss:0.146843	test-mlogloss:0.223758
[500]	train-mlogloss:0.121627	test-mlogloss:0.202627
[550]	train-mlogloss:0.101318	test-mlogloss:0.186029
[600]	train-mlogloss:0.084881	test-mlogloss:0.172943
[650]	train-mlogloss:0.0715383	test-mlogloss:0.16269
[700]	train-mlogloss:0.0606653	test-mlogloss:0.154631
[750]	train-mlogloss:0.0517803	test-mlogloss:0.148338
[800]	train-mlogloss:0.044478	test-mlogloss:0.143419
[850]	train-mlogloss:0.038464	test-mlogloss:0.139635
[900]	train-mlogloss:0.0334903	test-mlogloss:0.13673

In [19]:
import operator
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))
importance

[('Rolling_cov_5', 52),
 ('Rolling_cov_3', 589),
 ('Year', 2404),
 ('Weighted_Price', 3024),
 ('Rolling_mean_5', 4682),
 ('Close', 4705),
 ('Rolling_mean_3', 5230),
 ('Bollinger_band_mean_3_max', 5459),
 ('Bollinger_band_mean_5_max', 6822),
 ('High', 9150),
 ('Rolling_mean_10', 10699),
 ('Low', 10929),
 ('Bollinger_band_mean_5_min', 11618),
 ('Bollinger_band_mean_3_min', 13842),
 ('yearmonth', 14256),
 ('Bollinger_band_mean_10_max', 16137),
 ('yearweek', 19218),
 ('Bollinger_band_mean_10_min', 19476),
 ('Month', 20329),
 ('yearweekday', 26745),
 ('Weekday', 31299),
 ('Week', 35116),
 ('Volume_Currency', 47450),
 ('Open', 52526),
 ('Rolling_std_5', 59523),
 ('High-low', 60658),
 ('Day', 65182),
 ('Rolling_std_10', 66400),
 ('Volume_BTC', 80068),
 ('Momentum_7', 88223),
 ('Momentum_10', 89828),
 ('Close-open', 90627),
 ('Momentum_5', 96923),
 ('Daily_return', 126629),
 ('Momentum_3', 129144),
 ('Rolling_std_3', 157556)]

In [20]:
importance = model.get_score(fmap='', importance_type='gain')
importance = sorted(importance.items(), key=operator.itemgetter(1))
importance

[('Rolling_cov_3', 0.44464253717657054),
 ('Weekday', 0.47395862513147397),
 ('Day', 0.4744266097092883),
 ('Week', 0.5193313165690181),
 ('Volume_BTC', 0.5422115800206447),
 ('Month', 0.5560869329059716),
 ('Rolling_cov_5', 0.5565997423076924),
 ('Year', 0.5593143460420117),
 ('Volume_Currency', 0.5829833221469765),
 ('Momentum_5', 0.6263700547213984),
 ('Momentum_10', 0.6265618135609475),
 ('yearweekday', 0.6358946635371938),
 ('Momentum_7', 0.6384458416022141),
 ('Rolling_std_10', 0.689176584933971),
 ('High-low', 0.7309376197528615),
 ('Bollinger_band_mean_3_max', 0.7546737668365984),
 ('Rolling_std_5', 0.8007603315860689),
 ('Low', 0.8649893041525225),
 ('Rolling_mean_3', 0.8674518708890959),
 ('Bollinger_band_mean_5_max', 0.891471700265174),
 ('Close', 0.8921436513496293),
 ('Bollinger_band_mean_10_max', 0.925914805810691),
 ('Open', 0.9275109546808378),
 ('Rolling_mean_5', 0.9647310600501906),
 ('yearweek', 1.0084876061404349),
 ('Weighted_Price', 1.0138816069678223),
 ('High', 