In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesRegressor
import datetime

In [2]:
def date_parser(string):
    """Convert a Unix epoch timestamp in seconds to a ``datetime``.

    ``string`` is coerced with ``int()`` first, so a digit string or an
    integer is accepted. The result comes from ``datetime.fromtimestamp``,
    i.e. a naive datetime expressed in the machine's local timezone.
    """
    epoch_seconds = int(string)
    return datetime.datetime.fromtimestamp(epoch_seconds)

In [3]:
# Load the raw trades. Column names are supplied explicitly, and the 'time'
# column is converted through date_parser (defined in an earlier cell).
data = pd.read_csv(
    '../../../data/coinbaseUSD.csv',
    names=['time', 'price', 'volume'],
    parse_dates=['time'],
    date_parser=date_parser,
)
# Quick look at the first rows and the summary statistics.
print(data.head())
print(data.describe())

                 time  price    volume
0 2014-12-01 11:03:56    300  0.010000
1 2014-12-01 11:10:23    300  0.010000
2 2014-12-01 11:54:08    370  0.010000
3 2014-12-01 12:20:12    370  0.026556
4 2014-12-02 10:59:26    377  0.010000

[5 rows x 3 columns]


  score = values[idx]


                 price        volume
count  18192817.000000  1.819282e+07
mean        873.839092  4.123018e-01
std         762.220701  1.482153e+00
min           0.060000  1.000000e-08
25%         347.920000  1.028369e-02
50%         538.990000  5.000000e-02
75%        1142.120000  2.500010e-01
max        2999.990000  8.979820e+02

[8 rows x 2 columns]


Some data cleaning is required, as the prices seem very volatile at the beginning of the period. Also, the data shows individual trade prices, so we need to group trades by second to get one price per second.

Check the time difference between consecutive trades

# Plot every trade price in row order. The Series is plotted against its own
# index, so the x axis is the row index of `data`, one point per trade.
plt.plot(data.price)
plt.xlabel('time')
plt.ylabel('price')
plt.title('Price per trade')
plt.show()

In [None]:
# Skip the first 5000 rows, then drop exact duplicate rows, printing the shape
# after each step. NOTE: this cell rebinds `data`, so re-running it trims
# another 5000 rows.
data = data.iloc[5000:]
print(data.shape)
data = data.drop_duplicates()
print(data.shape)

(18187817, 3)


In [None]:
# Compare the number of distinct rows with the number of distinct timestamps.
print(data.drop_duplicates().shape)
print(len(set(data.time)))

It seems that prices are strongly affected by lagged prices over a short period. We use the prices of the previous 225 trades (roughly the last 120 minutes of data) as regressors.

In [None]:
# Number of preceding rows (trades) whose prices become regressors.
LAG = 225

# price1 .. price<LAG>: the price observed k rows earlier.
for lag in range(1, LAG + 1):
    data['price%d' % lag] = data['price'].shift(lag)

# The target starts out as the price of the next row ...
data['target'] = data['price'].shift(-1)
# ... rows with any missing value are removed (this includes the first LAG
# rows and the last row, which have no complete lags / no next price) ...
data = data.dropna()
# ... and the target finally becomes the change from the current price to the next one.
data['target'] = data['target'] - data['price']

In [None]:
# Sanity check after feature construction: frame dimensions and the first rows.
print(data.shape)
print(data.head(3))

In [None]:
# Visualise the target (change in price from one row to the next).
plt.plot(data.target)
plt.xlabel('time')
plt.ylabel('price changes')
plt.title('Price changes between consecutive trades')
plt.show()

print(data.target.describe())

# Trading thresholds: upper and lower quartile of the observed price change.
# NOTE(review): both are computed on the whole of `data`, which still contains
# the rows later held out as the test set - confirm this look-ahead is intended.
buy_limit = np.percentile(data.target, 75)
sell_limit = np.percentile(data.target, 25)
print("buy limit is USD %s" % buy_limit)
print("sell limit is USD %s" % sell_limit)

We will predict the target variable (the price change between consecutive trades) using lagged prices and traded volume.

In [None]:
# Positional 75/25 split without shuffling: the first three quarters of the
# rows become the training set, the remainder the test set.
# iloc requires integer positions; a float bound such as shape[0]*.75 raises
# TypeError in pandas, so the cut-off is truncated to an int once and reused
# for both slices (guaranteeing the two parts neither overlap nor leave a gap).
split_at = int(data.shape[0] * .75)
train = data.iloc[:split_at].reset_index(drop=True)
test = data.iloc[split_at:].reset_index(drop=True)
# The full frame is no longer needed; release the reference to free memory.
del data

In [None]:
def _price_columns_up_to(columns, max_lag):
    """Return 'price' plus every 'price<k>' column with k <= max_lag, in column order."""
    selected = []
    for name in columns:
        if name == 'price':
            selected.append(name)
        elif 'price' in name and int(name.replace('price', '')) <= max_lag:
            selected.append(name)
    return selected

# Three nested feature sets: the current price plus the last 225, 120 and 60 lags.
regressors1 = _price_columns_up_to(train.columns, 225)
regressors2 = _price_columns_up_to(train.columns, 120)
regressors3 = _price_columns_up_to(train.columns, 60)

In [None]:
# Two linear baselines fitted on the widest feature set (regressors1):
# a Bayesian ridge regression and an ordinary least-squares regression.
# NOTE(review): the `normalize` argument only exists in older scikit-learn
# releases - confirm the pinned version before re-running.
clf = BayesianRidge(compute_score=True, normalize=True)
clf.fit(train[regressors1], train['target'])

ols = LinearRegression(normalize=True)
ols.fit(train[regressors1], train['target'])

In [None]:
# Tree-ensemble benchmark on the same features: 100 trees, fixed random_state
# so repeated runs build the same forest, verbose=1 for progress output.
model1 = ExtraTreesRegressor(
    n_estimators=100,
    criterion='mse',
    random_state=123,
    verbose=1,
)
model1.fit(train[regressors1], train['target'])

In [None]:
# Test-set predictions consumed by the comparison plot in the following cell.
# They were commented out, which left clf_pred / et_pred undefined and made
# that plot fail with NameError on a fresh Run All.
clf_pred = clf.predict(test[regressors1])
et_pred = model1.predict(test[regressors1])
# The OLS prediction stays disabled: the plot line that would use it is commented out as well.
#ols_pred = ols.predict(test[regressors1])

In [None]:
# Overlay the Bayesian ridge and ExtraTrees test-set predictions on the true
# target. Expects clf_pred and et_pred to have been computed beforehand.
plt.title("Prediction")
plt.plot(clf_pred, color='lightgreen', label="Bayesian Ridge estimate")
#plt.plot(ols_pred, color='navy', linestyle='--', label="OLS estimate")
plt.plot(et_pred, color='navy', linestyle='--', label="ET estimate")
plt.plot(test.target, label = 'ground truth')
plt.xlabel("Cases")
plt.ylabel("Estimated target")
plt.legend(loc="best", prop=dict(size=12))
plt.show()

In [None]:
# First stage of the ensemble: one Bayesian ridge model per lag window.
# Each model is fitted on the training rows only, and its predictions are
# stored as a new column on both train and test.
for pred_col, cols in (('pred1', regressors1),
                       ('pred2', regressors2),
                       ('pred3', regressors3)):
    clf = BayesianRidge(compute_score=True, normalize=True)
    clf.fit(train[cols], train['target'])
    train[pred_col] = clf.predict(train[cols])
    test[pred_col] = clf.predict(test[cols])

In [None]:
# Inputs of the second-stage model: volume and price plus the three first-stage predictions.
new_regressors = ['volume', 'price', 'pred1', 'pred2', 'pred3']
# Display the last five training rows of those inputs next to the target.
train[new_regressors + ['target']].tail(5)

We see that for some periods the price-difference predictions are quite accurate. An ensemble can boost performance in this case.

In [None]:
# Second-stage model: Bayesian ridge on the ensemble inputs, fitted on train
# and then used to predict the target for the test rows.
clf = BayesianRidge(
    compute_score=True,
    normalize=True,
    fit_intercept=True,
)
clf.fit(train[new_regressors], train['target'])
pred = clf.predict(test[new_regressors])

In [None]:
# Compare the second-stage (ensemble) predictions in `pred` with the true
# target on the test rows.
plt.title("Prediction")
plt.plot(pred, color='lightgreen', label="Bayesian Ridge estimate")
plt.plot(test.target, label = 'ground truth')
plt.xlabel("Cases")
plt.ylabel("Estimated target")
plt.legend(loc="best", prop=dict(size=12))
plt.show()

Let's formulate a basic trading strategy. When the current position is <= 0 and the predicted price diff is >= the buy limit, we buy. If the current
position is >= 0 and the predicted price diff is <= the sell limit, we sell. Otherwise we hold the position.

In [None]:
def backtest(pred,actual_price,current_pos=0,cum_profit=0,current_balance=0,
             buy_threshold=None,sell_threshold=None):
    """Replay a one-unit, long-only strategy driven by predicted price changes.

    For every prediction, in order:
      * if the prediction is >= the buy threshold and no position is held
        (``current_pos <= 0``), one unit is bought at ``actual_price[index]``;
      * if the prediction is <= the sell threshold and a position is held
        (``current_pos == 1``), the unit is sold at ``actual_price[index]``
        and the realised profit is updated to the current balance.
    Both checks run on every step, so a buy and a sell can happen on the same
    step when the thresholds overlap.

    Parameters
    ----------
    pred : iterable of numbers
        Predicted price changes, one per step.
    actual_price : sequence of numbers
        Trade prices indexed like ``pred`` (must be at least as long).
    current_pos : int, default 0
        Starting position: 1 means a unit is already held.
    cum_profit : number, default 0
        Starting realised profit, reported until the first sale.
    current_balance : number, default 0
        Starting cash balance.
    buy_threshold, sell_threshold : number, optional
        Decision thresholds. When omitted, the module-level ``buy_limit`` /
        ``sell_limit`` values are used, which is the original behaviour.

    Returns
    -------
    (balance, profit) : tuple of lists
        Cash balance and realised profit after each step, each ``len(pred)`` long.

    A short summary (trade counts, highest buy price, win/loss counts and
    per-trade return statistics) is printed as a side effect.
    """
    # Fall back to the notebook-level thresholds when none are passed in.
    if buy_threshold is None:
        buy_threshold = buy_limit
    if sell_threshold is None:
        sell_threshold = sell_limit

    profit = []
    balance = []
    buys = []
    returns = []
    buy_count = 0
    sell_count = 0
    num_profit = 0
    num_loss = 0
    # Entry price of the open position. It stays None when the run starts with
    # an inherited position (current_pos=1), whose entry price is unknown.
    buy_price = None

    for index, i in enumerate(pred):
        if i >= buy_threshold and current_pos <= 0:
            current_pos = 1
            buy_price = actual_price[index]
            buys.append(buy_price)
            current_balance -= buy_price
            buy_count += 1
        if i <= sell_threshold and current_pos == 1:
            current_pos = 0
            sell_price = actual_price[index]
            current_balance += sell_price
            cum_profit = current_balance
            sell_count += 1
            # Win/loss and return statistics need a known entry price.
            if buy_price is not None:
                if sell_price > buy_price:
                    num_profit += 1
                else:
                    num_loss += 1
                returns.append((sell_price-buy_price)*1.0/buy_price)
        profit.append(cum_profit)
        balance.append(current_balance)

    # Single-argument print calls produce the same text on Python 2 and 3.
    print("%s %s" % (buy_count, sell_count))
    if buys:
        print("max buy price in USD  %s" % max(buys))
    else:
        # max() of an empty list raises ValueError.
        print("max buy price in USD  n/a (no buys)")
    print("%s %s" % (num_profit, num_loss))
    if returns:
        print(pd.DataFrame(returns).describe())
    else:
        # describe() cannot summarise an empty frame.
        print("no completed trades")

    return balance, profit

In [None]:
# Run the strategy on the ensemble predictions against the test-period prices.
balance, profit = backtest(pred=pred, actual_price=list(test.price))

In [None]:
# Draw the realised profit returned by backtest() and the test-period price
# on the same axes.
plt.title("Backtesting")
plt.plot(profit, color='blue', label="Profit")
plt.plot(test.price, label = 'Bitcoin price')
plt.xlabel("Time")
plt.ylabel("Price in dollar")
plt.legend(loc="best", prop=dict(size=12))
plt.show()

In [None]:
# Report time, realised profit and price at three points of the backtest:
# the step with the highest profit, the first step and the last step.
peak = profit.index(max(profit))
for label, pos in (("max profit at", peak), ("profit at", 0), ("profit at", -1)):
    print("%s %s with profit of USD  %s and bitcoin price USD  %s"
          % (label, test.time.iloc[pos], profit[pos], test.price.iloc[pos]))

We see some profit over time. Net profit is $47 (20%) for trading a single bitcoin over the trading period of one day.