In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb
import operator

import settings
import utils
import get_data
from ta import *



### Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['krakenEUR', 'bitstampUSD'] (examples; the full list of markets is at https://bitcoincharts.com/charts/volumepie/)

In [2]:
# Optional refresh step, left disabled: un-commenting the call below invokes
# get_data.get with the target CSV path, the 'Hourly' period and the
# 'bitstampUSD' market (presumably re-downloading the dataset -- TODO confirm).
# get_data.get('data/datas.csv', period='Hourly', market='bitstampUSD')

### Load Data

In [3]:
# Load the stored price history (comma-separated) into the working frame.
df = pd.read_csv(
    'data/datas.csv',
    sep=',',
)

In [4]:
# Report the size of the freshly loaded frame.
print('Number of rows: {}, Number of columns: {}'.format(df.shape[0], df.shape[1]))

Number of rows: 52057, Number of columns: 8


### Preprocessing

In [5]:
# Clean the raw frame with the project's utils.dropna helper (presumably it
# drops rows containing missing values -- confirm in utils); the recorded
# outputs show the row count going from 52057 to 46306 around this step.
df = utils.dropna(df)

In [6]:
# Report the frame size after cleaning.
print('Number of rows: {}, Number of columns: {}'.format(df.shape[0], df.shape[1]))

Number of rows: 46306, Number of columns: 8


### Transformation

Create the `Target` column with three classes: KEEP (0), UP (1) and DOWN (2).

In [7]:
# Classify each row by how Close compares with Open:
#   1 (UP)   - Close exceeds Open by more than settings.PERCENT_UP
#   2 (DOWN) - Close is below Open by more than settings.PERCENT_DOWN
#   0 (KEEP) - everything else
closed_higher = df.Open + (df.Open * settings.PERCENT_UP) < df.Close
closed_lower = df.Open - (df.Open * settings.PERCENT_DOWN) > df.Close

df['Target'] = 0 # 'KEEP'
df.loc[closed_higher, 'Target'] = 1 # 'UP'
# DOWN is written last, so it takes precedence if both masks ever matched.
df.loc[closed_lower, 'Target'] = 2 # 'DOWN'

In [8]:
# Report the frame size and how many rows fall in the UP (1) / DOWN (2) classes.
print('Number of rows: {}, Number of columns: {}'.format(df.shape[0], df.shape[1]))
up_count = (df.Target == 1).sum()
down_count = (df.Target == 2).sum()
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(up_count, down_count))

Number of rows: 46306, Number of columns: 9
Number of UP rows: 3432, Number of DOWN rows: 3226


Derive the Date, Year, Month, Week, Weekday, Day and Hour columns from Timestamp.

### Feature Engineering

In [9]:
# Convert the raw Timestamp values with utils.timestamptodate and parse the
# result into a pandas datetime column.
df['Date'] = pd.to_datetime(df['Timestamp'].apply(utils.timestamptodate))

# Calendar components, added in exactly this column order.
# NOTE(review): Series.dt.weekofyear is deprecated in pandas >= 1.1 and removed
# in 2.0 (replacement: dt.isocalendar().week) -- revisit when upgrading pandas.
date_accessor = df['Date'].dt
for column_name, attribute in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Week', 'weekofyear'),
    ('Weekday', 'weekday'),
    ('Day', 'day'),
    ('Hour', 'hour'),
):
    df[column_name] = getattr(date_accessor, attribute)

# extra dates
# df["yearmonth"] = df["Date"].dt.year*100 + df["Date"].dt.month
# df["yearweek"] = df["Date"].dt.year*100 + df["Date"].dt.weekofyear
# df["yearweekday"] = df["Date"].dt.year*10 + df["Date"].dt.weekday

In [10]:
# Lag the price and volume columns by one row, so each row carries the values
# of the row before it; rows left with missing values are then discarded.
cols = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price']
df[cols] = df[cols].shift(1)
df = df.dropna()

In [11]:
def _label_moves(frame, column, went_up, went_down):
    """Write a 0/1/2 direction code into ``frame[column]``.

    Every row starts at 0; rows selected by ``went_up`` become 1 and rows
    selected by ``went_down`` become 2 (applied last, so it wins on overlap).
    """
    frame[column] = 0
    frame.loc[went_up, column] = 1 # 'UP'
    frame.loc[went_down, column] = 2 # 'DOWN'


# Simple spreads of the (lagged) candle.
df['High-low'] = df['High'] - df['Low']
df['Close-open'] = df['Close'] - df['Open']

# 'UP' or 'DOWN' if diff > settings.PERCENT_UP
_label_moves(
    df, 'Up_or_Down',
    ( df.Open + (df.Open * settings.PERCENT_UP) ) < df.Close,
    ( df.Open - (df.Open * settings.PERCENT_DOWN) ) > df.Close,
)

# 'UP' or 'DOWN' if diff > settings.PERCENT_UP * 2
_label_moves(
    df, 'Up_or_Down_2',
    df.Open + (df.Open * settings.PERCENT_UP * 2 ) < df.Close,
    df.Open - (df.Open * settings.PERCENT_DOWN * 2) > df.Close,
)

# 'UP' or 'DOWN' if diff > 0
_label_moves(
    df, 'Up_or_Down_3',
    df.Open < df.Close,
    df.Open  > df.Close,
)

# 'UP' or 'DOWN' if diff > settings.PERCENT_UP / 2
_label_moves(
    df, 'Up_or_Down_4',
    df.Open + (df.Open * settings.PERCENT_UP / 2 ) < df.Close,
    df.Open - (df.Open * settings.PERCENT_DOWN / 2) > df.Close,
)

In [12]:
# Fundamental analysis

# Return of each row relative to the previous row's Close, as a fraction and
# scaled by 100.
close_over_previous = df['Close'] / df['Close'].shift(1)
df['Daily_return'] = close_over_previous - 1
df['Daily_return_100'] = (close_over_previous - 1) * 100

# Drop rows with missing values (the shift above leaves one without a
# predecessor), then measure every Close against the first remaining Close.
df = df.dropna()
first_close = df['Close'].iloc[0]
df['Cumulative_return'] = (df['Close'] / first_close) - 1
df['Cumulative_return_100'] = ((df['Close'] / first_close) - 1) * 100

# TODO: cumulative return week, month, year...

In [13]:
# Report the frame size after the hand-built features were added.
print('Number of rows: {}, Number of columns: {}'.format(df.shape[0], df.shape[1]))

Number of rows: 46304, Number of columns: 26


# Technical Analysis

https://en.wikipedia.org/wiki/Technical_analysis

### Volume-based indicators

In [14]:
# Volume-based indicators. Every indicator is computed twice: once on the
# 'Volume_BTC' column and once on 'Volume_Currency'. The helper functions are
# not defined in this notebook; presumably they come from `from ta import *`.
# NOTE(review): the statement order is kept as-is because it fixes the column
# order of df and the helpers appear to write intermediate columns into df.

# Accumulation/Distribution index
df['Acc_Dist_Roc_BTC'] = acc_dist_roc(df, 'Volume_BTC', 2)
df['Acc_Dist_Roc_Currency'] = acc_dist_roc(df, 'Volume_Currency', 2)
df['Acc_Dist_BTC'] = acc_dist_index(df, 'Volume_BTC')
df['Acc_Dist_Currency'] = acc_dist_index(df, 'Volume_Currency')

# Chaikin Money Flow (three variants; variants 2 and 3 take a period of 20)
df['Chaikin_Money_Flow_1_BTC'] = chaikin_money_flow1(df, 'Volume_BTC')
df['Chaikin_Money_Flow_2_BTC'] = chaikin_money_flow2(df, 'Volume_BTC', 20)
df['Chaikin_Money_Flow_3_BTC'] = chaikin_money_flow3(df, 'Volume_BTC', 20)
df['Chaikin_Money_Flow_1_Currency'] = chaikin_money_flow1(df, 'Volume_Currency')
df['Chaikin_Money_Flow_2_Currency'] = chaikin_money_flow2(df, 'Volume_Currency', 20)
df['Chaikin_Money_Flow_3_Currency'] = chaikin_money_flow3(df, 'Volume_Currency', 20)

# Money Flow Index (period of 14)
df['Money_Flow_BTC'] = money_flow_index(df, 'Volume_BTC', 14)
df['Money_Flow_Currency'] = money_flow_index(df, 'Volume_Currency', 14)

# On-balance volume, plus its mean variant
df['OBV_BTC'] = on_balance_volume(df, 'Volume_BTC')
df['OBV_BTC_mean'] = on_balance_volume_mean(df, 'Volume_BTC')
df['OBV_Currency'] = on_balance_volume(df, 'Volume_Currency')
df['OBV_Currency_mean'] = on_balance_volume_mean(df, 'Volume_Currency')

# Force Index (period of 2)
df['Force_Index_BTC'] = force(df, 'Volume_BTC', 2)
df['Force_Index_Currency'] = force(df, 'Volume_Currency', 2)

# delete intermediate columns: 'OBV' is not assigned in this cell, so it is
# presumably a scratch column left in df by the on-balance-volume helpers;
# it is removed so it does not become a model feature.
df.drop('OBV', axis=1, inplace=True)

	Series.ewm(ignore_na=False,span=3,min_periods=2,adjust=True).mean()
  return pd.Series(pd.ewma(ad, span=3, min_periods=2) - pd.ewma(ad, span=10, min_periods=9))
	Series.ewm(ignore_na=False,span=10,min_periods=9,adjust=True).mean()
  return pd.Series(pd.ewma(ad, span=3, min_periods=2) - pd.ewma(ad, span=10, min_periods=9))
	Series.rolling(window=20,center=False).mean()
  return pd.Series(pd.rolling_mean(mf, n))
	Series.rolling(window=20,center=False).sum()
  return pd.Series(moments.rolling_sum(clv*df[col_volume], n) / moments.rolling_sum(df[col_volume], n))
	Series.rolling(window=14,center=False).sum()
  n_positive_mf = pd.rolling_sum(df['1_Period_Positive_Money_Flow'], n)
	Series.rolling(window=14,center=False).sum()
  n_negative_mf = pd.rolling_sum(df['1_Period_Negative_Money_Flow'], n)
	Series.rolling(window=10,center=False).mean()
  return pd.Series(pd.rolling_mean(df['OBV'], n))


### Trend indicators

In [15]:
# Trend indicators. macd, adx and vortex are not defined in this notebook
# (presumably provided by `from ta import *`); each call's result is spread
# over several df columns at once.

# Moving Average Convergence Divergence with fast=12, slow=26, signal=9
# (argument roles taken from the n_fast / n_slow / n_sign names visible in
# the recorded warnings below this cell).
df[['MACD', 'MACD_sign', 'MACD_diff']] = macd(df, 12, 26, 9)

# Average directional movement index over 14 periods: the index itself plus
# its positive and negative components.
df[['ADX', 'ADX_pos', 'ADX_neg']] = adx(df, 14)

# Vortex indicator over 14 periods: positive and negative components.
df[['Vortex_pos', 'Vortex_neg']] = vortex(df, 14)

	Series.ewm(ignore_na=False,span=12,min_periods=25,adjust=True).mean()
  EMAfast = pd.Series(pd.ewma(df['Close'], span=n_fast, min_periods=n_slow - 1))
	Series.ewm(ignore_na=False,span=26,min_periods=25,adjust=True).mean()
  EMAslow = pd.Series(pd.ewma(df['Close'], span=n_slow, min_periods=n_slow - 1))
	Series.ewm(ignore_na=False,span=9,min_periods=8,adjust=True).mean()
  MACDsign = pd.Series(pd.ewma(MACD, span=n_sign, min_periods=n_sign - 1), name='MACD_sign_%d_%d' % (n_fast, n_slow))
	Series.rolling(window=14,center=False).sum()
  trs = pd.rolling_sum(tr, n)
	Series.rolling(window=14,center=False).sum()
  dip = 100 * pd.rolling_sum(pos, n) / trs
	Series.rolling(window=14,center=False).sum()
  din = 100 * pd.rolling_sum(neg, n) / trs
	Series.ewm(ignore_na=False,min_periods=0,adjust=True,com=14).mean()
  adx = pd.ewma(dx, n)
	Series.rolling(window=14,center=False).sum()
  trn = moments.rolling_sum(tr, n)
	Series.rolling(window=14,center=False).sum()
  vip = moments.rolling_sum(vmp, n) 

### Momentum Indicators

In [16]:
# Momentum indicator: RSI with a parameter of 14, computed by rsi(), which is
# not defined in this notebook (presumably provided by `from ta import *`).
df['RSI'] = rsi(df, 14)

	Series.ewm(ignore_na=False,min_periods=0,adjust=True,com=14).mean()
  emaup = pd.ewma(up, n)
	Series.ewm(ignore_na=False,min_periods=0,adjust=True,com=14).mean()
  emadn = pd.ewma(dn, n)


In [17]:
# Disabled debugging snippet kept as a bare string literal, so nothing in it
# runs; the text would print each column name with its count of null values.
# Because the string is the cell's last expression, the notebook echoes it.
"""
for c in df.columns:
    print str(c) + u' - ' + str(df[c].isnull().sum())
"""

"\nfor c in df.columns:\n    print str(c) + u' - ' + str(df[c].isnull().sum())\n"

### Price-based indicators 

In [18]:
# Price-based indicators, each computed for every period from 2 to 10.
# Three separate loops are kept on purpose: they fix the order in which the
# columns are appended to df (all Momentum_*, then the Rolling_* triples, then
# the Bollinger pairs).
periods = range(2, 11)

# Momentum: relative change of Close versus the Close m rows earlier.
for m in periods:
    df['Momentum_'+str(m)] = ((df['Close'] / df['Close'].shift(m)) - 1)

# Rollings
# The Date-indexed Close series does not change inside the loop, so it is
# built once here instead of once per statistic per period.
close_by_date = df.set_index('Date')['Close']
for m in periods:
    close_window = close_by_date.rolling(window=m)
    # .values strips the Date index so the result is assigned positionally.
    df['Rolling_mean_'+str(m)] = (close_window.mean()).values
    df['Rolling_std_'+str(m)] = (close_window.std()).values
    # NOTE(review): cov() is called without another series; pandas documents
    # that this defaults to the series itself, which would make this column the
    # rolling variance -- confirm that is intended.
    df['Rolling_cov_'+str(m)] = (close_window.cov()).values

# Bollinger bands: rolling mean plus/minus two rolling standard deviations.
for m in periods:
    df['Bollinger_band_mean_'+str(m)+'_max'] = df['Rolling_mean_'+str(m)] + (2*df['Rolling_std_'+str(m)])
    df['Bollinger_band_mean_'+str(m)+'_min'] = df['Rolling_mean_'+str(m)] - (2*df['Rolling_std_'+str(m)])

In [19]:
# Show the frame size before and after discarding rows with missing values
# (the indicator columns are incomplete for some rows).
print('Number of rows: {}, Number of columns: {}'.format(df.shape[0], df.shape[1]))
df = df.dropna()
print('Number of rows: {}, Number of columns: {}'.format(df.shape[0], df.shape[1]))

Number of rows: 46304, Number of columns: 107
Number of rows: 46239, Number of columns: 107


### Split

In [20]:
# Partition the feature frame into training and evaluation subsets using the
# project helper utils.split_df2. NOTE(review): how the split is made
# (chronological vs. random, and the ratio) is not visible here -- confirm in utils.
train, test = utils.split_df2(df)

In [21]:
# Model inputs: every column of df, in its existing order, except the label
# and the two raw time columns.
excl = ['Target', 'Date', 'Timestamp']
cols = []
for column_name in df.columns:
    if column_name not in excl:
        cols.append(column_name)

# xgboost

In [22]:
# Train a 3-class XGBoost model on the engineered features, score it on the
# held-out rows and print two feature-importance rankings.
y_train = train['Target']
y_mean = np.mean(y_train)  # NOTE(review): not referenced again in this cell
xgb_params = {
    # NOTE(review): the number of boosting rounds actually used is
    # num_boost_rounds below -- confirm that 'n_trees' has any effect here.
    'n_trees': 800,
    'eta': 0.0045,
    'max_depth': 20,
    'subsample': 0.95,
    'colsample_bytree': 0.95,
    'colsample_bylevel': 0.95,
    'objective': 'multi:softmax',
    'num_class' : 3,  # Target takes the values 0, 1 and 2
    'eval_metric': 'mlogloss', # 'merror', # 'rmse',
    'base_score': 0,
    'silent': 1
}

dtrain = xgb.DMatrix(train[cols], y_train)
dtest = xgb.DMatrix(test[cols])

# NOTE(review): this default-settings cross-validation result is not read
# anywhere in this cell; it is kept in case a later cell relies on cv_result.
cv_result = xgb.cv(xgb_params, dtrain)

# xgboost, cross-validation (disabled search for the number of rounds)
# cv_result = xgb.cv(xgb_params,
#                    dtrain,
#                    num_boost_round=5000,
#                    early_stopping_rounds=50,
#                    verbose_eval=50,
#                    show_stdv=False
#                   )
# num_boost_rounds = len(cv_result)

# Hard-coded round count (presumably the value the disabled search produced).
num_boost_rounds = 705

print(num_boost_rounds)

# train
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

# predict
y_pred = model.predict(dtest)
y_true = test['Target']

utils.metrics(y_true, y_pred)

# Single-argument print(...) calls produce the same text under Python 2 and
# also parse under Python 3, matching the print(...) style used in the other cells.
print("\n \n \n \n \n \n ********** WEIGHT ************")
importance = model.get_fscore()
# Ascending by score, so the highest-scoring features are printed last.
importance = sorted(importance.items(), key=operator.itemgetter(1))
for feature_score in importance:
    print(feature_score)

print("\n \n \n \n \n \n ********** GAIN ************")
importance = model.get_score(fmap='', importance_type='gain')
importance = sorted(importance.items(), key=operator.itemgetter(1))
for feature_score in importance:
    print(feature_score)

705
Accuracy: 0.858044982699
Coefficient Kappa: 0.137954280761
Classification Report:
             precision    recall  f1-score   support

       KEEP       0.87      0.99      0.93      9893
         UP       0.43      0.08      0.14       860
       DOWN       0.34      0.06      0.10       807

avg / total       0.80      0.86      0.81     11560

Confussion Matrix:
[[9802   45   46]
 [ 744   71   45]
 [ 712   49   46]]

 
 
 
 
 
 ********** WEIGHT ************
('Cumulative_return_100', 21)
('Cumulative_return', 245)
('Rolling_mean_2', 775)
('Rolling_mean_9', 876)
('Up_or_Down_3', 964)
('Rolling_mean_10', 1024)
('Rolling_mean_8', 1044)
('Rolling_mean_7', 1187)
('Bollinger_band_mean_7_max', 1199)
('Rolling_mean_6', 1206)
('Rolling_mean_3', 1241)
('Rolling_mean_5', 1243)
('Weighted_Price', 1273)
('Bollinger_band_mean_8_max', 1354)
('Rolling_mean_4', 1356)
('Bollinger_band_mean_9_max', 1424)
('Bollinger_band_mean_6_max', 1555)
('Bollinger_band_mean_7_min', 1715)
('Bollinger_band_mean