In [157]:
import operator

import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb

import settings
import utils
import get_data

### Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['coincheckJPY', 'krakenEUR', 'bitstampUSD', 'okcoinCNY', 'btcnCNY', 'krakenUSD', 'itbitUSD', 'bitbayPLN', 'btcoidIDR', 'localbtcRUB', 'localbtcGBP', 'btcdeEUR', 'coinfloorGBP', 'localbtcUSD']

In [158]:
# One-off download step, left disabled; the next cell reads 'data/datas.csv' from disk.
# NOTE(review): presumably get_data.get writes that CSV — confirm in get_data.py.
# get_data.get('data/datas.csv', period='Hourly', market='bitstampUSD')

### Load Data

In [159]:
# Read the cached price history (comma-separated, pandas' default delimiter).
df = pd.read_csv('data/datas.csv')

In [160]:
# (rows, columns) of the frame exactly as loaded from the CSV.
df.shape

(50756, 8)

### Preprocessing

In [161]:
# Clean the frame with the project helper; the exact rule lives in utils.dropna.
# NOTE(review): presumably drops rows with missing values — confirm in utils.py.
df = utils.dropna(df)

In [162]:
# (rows, columns) after the utils.dropna step, for comparison with the raw shape.
df.shape

(45005, 8)

### Transformation

Create a `Target` column with three classes: KEEP (0), UP (1) and DOWN (2).

In [163]:
# Label every row by its open-to-close move: 0 = KEEP, 1 = UP, 2 = DOWN.
# A row is UP when Close exceeds Open by more than PERCENT_UP, and DOWN when
# Close falls below Open by more than PERCENT_DOWN; everything else stays KEEP.
up_threshold = df.Open + (df.Open * settings.PERCENT_UP)
down_threshold = df.Open - (df.Open * settings.PERCENT_DOWN)

df['Target'] = 0 # 'KEEP'
df.loc[up_threshold < df.Close, 'Target'] = 1 # 'UP'
df.loc[down_threshold > df.Close, 'Target'] = 2 # 'DOWN'

In [164]:
# Report the frame size and how many rows ended up in the UP / DOWN classes.
n_up = int((df.Target == 1).sum())
n_down = int((df.Target == 2).sum())
print('Number of rows: {}, Number of columns: {}'.format(df.shape[0], df.shape[1]))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(n_up, n_down))

Number of rows: 45005, Number of columns: 9
Number of UP rows: 1736, Number of DOWN rows: 1706


Derive calendar columns (Date, Year, Month, Day, Week and Weekday) from Timestamp.

### Feature Engineering

In [165]:
# Turn the raw Timestamp into a datetime column (conversion rule is in utils.timestamptodate).
df['Date'] = df['Timestamp'].apply(utils.timestamptodate)
df['Date'] = pd.to_datetime(df['Date'])

# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# Prefer `isocalendar().week` and fall back to the old accessor only on
# pandas versions that predate it.
date_parts = df['Date'].dt
if hasattr(date_parts, 'isocalendar'):
    iso_week = date_parts.isocalendar().week
    # isocalendar() returns a nullable UInt32 column; cast back to a plain
    # NumPy dtype (float only if missing dates are present) so downstream
    # arithmetic and model input keep ordinary numeric columns.
    week_of_year = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
else:
    week_of_year = date_parts.weekofyear

df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Day'] = date_parts.day
df['Week'] = week_of_year
df['Weekday'] = date_parts.weekday

# extra dates: compact integer keys combining the year with month / week / weekday
df["yearmonth"] = date_parts.year*100 + date_parts.month
# NOTE(review): the week number is the ISO week, which around New Year can belong
# to the adjacent ISO year while `year` is the calendar year — confirm acceptable.
df["yearweek"] = date_parts.year*100 + week_of_year
df["yearweekday"] = date_parts.year*10 + date_parts.weekday

In [166]:
# Per-row price range (High - Low) and body (Close - Open), each also stored halved.
# Note the '*_mean' columns are simply half of the corresponding difference.
high_low_range = df['High'] - df['Low']
close_open_move = df['Close'] - df['Open']

df['High-low'] = high_low_range
df['High-low_mean'] = high_low_range / 2.0
df['Close-open'] = close_open_move
df['Close-open_mean'] = close_open_move / 2.0

In [167]:
# Fundamental analysis

# daily return: relative change of Close versus the previous row
# (row-over-row, so the first row is NaN)
df['Daily_return'] = (df['Close'] / df['Close'].shift(1)) - 1
df['Daily_return_100'] = ((df['Close'] / df['Close'].shift(1)) - 1) * 100

# cumulative return: relative change of Close versus the first row of the frame.
# Take the first row by position: `df['Close'][0]` is a label lookup, which breaks
# (KeyError or wrong row) when the index no longer starts at 0 after rows were dropped.
first_close = df['Close'].iloc[0]
df['Cumulative_return'] = (df['Close'] / first_close) - 1
df['Cumulative_return_100'] = ((df['Close'] / first_close) - 1) * 100

# TODO: cumulative return week, month, year...

In [168]:
# technical analysis
#
# Every indicator is computed for look-back windows 2..10 and then moved up one
# row with shift(-1), i.e. row i stores the indicator value computed at row i+1.
# NOTE(review): that alignment gives each row information from the following
# row — confirm this look-ahead is intended for the prediction target.

windows = range(2, 11)
close_by_date = df.set_index('Date')['Close']

# momentum: relative change of Close against the Close `window` rows earlier
for window in windows:
    earlier_close = df['Close'].shift(window)
    momentum = (df['Close'] / earlier_close) - 1
    df['Momentum_{}'.format(window)] = momentum.shift(-1)

# rollings: mean, std and cov of Close over each window.
# cov() is called without `other`, which pandas documents as defaulting to the
# series itself.
for window in windows:
    rolling_close = close_by_date.rolling(window=window)
    for stat in ('mean', 'std', 'cov'):
        stat_values = getattr(rolling_close, stat)().shift(-1).values
        df['Rolling_{}_{}'.format(stat, window)] = stat_values

# bollinger bands: rolling mean plus / minus two rolling standard deviations
for window in windows:
    center = df['Rolling_mean_{}'.format(window)]
    spread = 2 * df['Rolling_std_{}'.format(window)]
    band_prefix = 'Bollinger_band_mean_{}'.format(window)
    df[band_prefix + '_max'] = center + spread
    df[band_prefix + '_min'] = center - spread

In [169]:
# Frame size after feature engineering.
n_rows, n_cols = df.shape
print('Number of rows: {}, Number of columns: {}'.format(n_rows, n_cols))

Number of rows: 45005, Number of columns: 80


In [170]:
# shift
# Move the raw price/volume columns and the simple derived features up one row,
# so row i holds the values that originally sat in row i+1, then discard every
# row left with a missing value (window warm-up rows and the final shifted row).
# NOTE(review): Target was computed from the unshifted Open/Close of row i, so
# after this step the features come from the following row — confirm the
# direction of the shift is intended.
cols = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price', 'High-low', 'High-low_mean', 'Close-open', 'Close-open_mean', 'Daily_return', 'Daily_return_100', 'Cumulative_return', 'Cumulative_return_100']
df[cols] = df[cols].shift(-1)
df = df.dropna()

In [171]:
# Frame size after shifting and dropping incomplete rows.
n_rows, n_cols = df.shape
print('Number of rows: {}, Number of columns: {}'.format(n_rows, n_cols))

Number of rows: 44995, Number of columns: 80


### Split

In [172]:
# Split into train / test frames with the project helper.
# NOTE(review): split ratio and ordering (chronological vs. random) are defined
# in utils.split_df and not visible here — confirm it is a time-ordered split.
train, test = utils.split_df(df)

In [173]:
# Feature columns = every column except the label and the raw time columns,
# kept in the frame's existing column order.
excl = ['Target', 'Date', 'Timestamp']
cols = df.columns[~df.columns.isin(excl)].tolist()

# xgboost

In [None]:
y_train = train['Target']

# Booster parameters for the 3-class model.
# The number of trees is set only through `num_boost_round` in xgb.train below.
# The former 'n_trees': 800 entry is not a documented XGBoost parameter and
# never determined the tree count, so it has been removed to avoid suggesting
# that 800 trees are trained.
xgb_params = {
    'eta': 0.0045,
    'max_depth': 20,
    'subsample': 0.85,
    'objective': 'multi:softmax',  # predict() returns the class label directly
    'num_class' : 3,  # 0 = KEEP, 1 = UP, 2 = DOWN
    'eval_metric': 'mlogloss', # 'merror', # 'rmse',
    'base_score': 0,
    'silent': 1  # NOTE(review): newer XGBoost releases replace this with 'verbosity'
}

dtrain = xgb.DMatrix(train[cols], y_train)
dtest = xgb.DMatrix(test[cols])

# Disabled cross-validation sketch for choosing the number of boosting rounds.
# NOTE(review): as written, num_boost_round=10 is smaller than
# early_stopping_rounds=50, so early stopping could never trigger; raise
# num_boost_round before re-enabling.
#
# cv_result = xgb.cv(xgb_params, dtrain)
#
# # xgboost, cross-validation
# cv_result = xgb.cv(xgb_params,
#                    dtrain,
#                    num_boost_round=10,
#                    early_stopping_rounds=50,
#                    verbose_eval=50,
#                    show_stdv=False
#                   )
# num_boost_rounds = len(cv_result)

num_boost_rounds = 1300

print(num_boost_rounds)

# train
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

# predict
y_pred = model.predict(dtest)
y_true = test['Target']

# Evaluate with the project helper (metric definitions live in utils.metrics2).
utils.metrics2(y_true, y_pred)

1300


In [None]:
# Feature importance from Booster.get_fscore(), sorted ascending so the
# highest-scoring features come last. `operator` is imported with the other
# modules at the top of the notebook.
importance = sorted(model.get_fscore().items(), key=operator.itemgetter(1))
importance

In [None]:
# Gain-based feature importance, sorted ascending by score.
gain_by_feature = model.get_score(fmap='', importance_type='gain')
importance = sorted(gain_by_feature.items(), key=operator.itemgetter(1))
importance