In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier
import datetime

In [115]:
def date_parser(string):
    return datetime.datetime.fromtimestamp(int(string))

In [116]:
data = pd.read_csv('../data/coinbaseUSD.csv',names=['time','price','volume'],nrows=25000,parse_dates=['time'],date_parser=date_parser)
print data.head()
print data.describe()

                 time  price    volume
0 2014-12-01 11:03:56    300  0.010000
1 2014-12-01 11:10:23    300  0.010000
2 2014-12-01 11:54:08    370  0.010000
3 2014-12-01 12:20:12    370  0.026556
4 2014-12-02 10:59:26    377  0.010000

[5 rows x 3 columns]
              price        volume
count  25000.000000  25000.000000
mean     245.605850      0.761495
std       29.355819      3.480678
min      109.870000      0.000000
25%      234.860000      0.010000
50%      240.020000      0.103950
75%      261.622500      0.616225
max      398.000000    208.258900

[8 rows x 2 columns]


Check the time difference between consecutive trades

In [117]:
alltimes = list(data.time)
timediffs = pd.DataFrame([(t - s).seconds for s, t in zip(alltimes, alltimes[1:])])
timediffs.describe()

Unnamed: 0,0
count,24999.0
mean,71.588064
std,1604.348635
min,0.0
25%,0.0
50%,2.0
75%,15.0
max,86000.0


In [118]:
plt.plot(data.price)
plt.xlabel('time')
plt.ylabel('price')
plt.title('Price per trade')
plt.show()

In [119]:
plt.scatter(data.volume*100,data.price)
plt.xlabel('trade volume')
plt.ylabel('price')
plt.title('Price per trade volume')
plt.show()

It seems that prices are highly affected by lagged prices over short period. we take lag till last 120 minutes of data (roughly 225 points) as regressor

In [120]:
LAG = 225
for i in range(1,LAG+1):
    var = 'price' + str(i)
    data[var] = data.price.shift(i)
data['target'] = data.price.shift(-1)    
data = data.dropna()    
data.target = data.target - data.price

In [121]:
print data.shape
print data.head(3)

(24774, 229)
                   time   price  volume  price1  price2  price3  price4  \
225 2015-01-08 08:17:03  273.70    0.01  275.67  311.56  297.48  294.47   
226 2015-01-08 08:17:27  302.61    0.01  273.70  275.67  311.56  297.48   
227 2015-01-08 08:17:50  344.06    0.01  302.61  273.70  275.67  311.56   

     price5  price6  price7  price8  price9  price10  price11  price12  \
225  328.21  301.01  320.94  330.98  308.63   306.42   346.20   305.78   
226  294.47  328.21  301.01  320.94  330.98   308.63   306.42   346.20   
227  297.48  294.47  328.21  301.01  320.94   330.98   308.63   306.42   

     price13  price14  price15  price16  price17      
225   291.96   348.43   326.34   272.87   317.23 ...  
226   305.78   291.96   348.43   326.34   272.87 ...  
227   346.20   305.78   291.96   348.43   326.34 ...  

[3 rows x 229 columns]


In [122]:
plt.plot(data.target)
plt.xlabel('time')
plt.ylabel('price changes')
plt.title('Price changes between consecutive trades')
plt.show()

data.target.describe()
buy_limit = np.percentile(data.target, 75)
sell_limit = np.percentile(data.target, 25)
print "buy limit is USD", buy_limit
print "sell limit is USD", sell_limit

buy limit is USD 0.04
sell limit is USD -0.02


we will predict target variable (price change between consecutive trades) using lagged prices and traded volume

In [123]:
train = data.iloc[:data.shape[0]*.75].reset_index(drop=True)
test = data.iloc[data.shape[0]*.75:].reset_index(drop=True)
del data

In [124]:
regressors1 = [i for i in train.columns if i == 'price' or ('price' in i and int(i.replace('price','')) <= 225)]
regressors2 = [i for i in train.columns if i == 'price' or ('price' in i and int(i.replace('price','')) <= 120)]
regressors3 = [i for i in train.columns if i == 'price' or ('price' in i and int(i.replace('price','')) <= 60)]

In [125]:
clf = BayesianRidge(compute_score=True,normalize=True)
clf.fit(train[regressors1],train.target)
ols = LinearRegression(normalize=True)
ols.fit(train[regressors1],train.target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [126]:
clf_pred1 = clf.predict(test[regressors1])
ols_pred = ols.predict(test[regressors1])

In [127]:
plt.title("Prediction")
plt.plot(clf_pred, color='lightgreen', label="Bayesian Ridge estimate")
plt.plot(ols_pred, color='navy', linestyle='--', label="OLS estimate")
plt.plot(test.target, label = 'ground truth')
plt.xlabel("Cases")
plt.ylabel("Estimated target")
plt.legend(loc="best", prop=dict(size=12))
plt.show()

In [128]:
clf = BayesianRidge(compute_score=True,normalize=True)
clf.fit(train[regressors1],train.target)
train['pred1'] = clf.predict(train[regressors1])
test['pred1'] = clf.predict(test[regressors1])

clf = BayesianRidge(compute_score=True,normalize=True)
clf.fit(train[regressors2],train.target)
train['pred2'] = clf.predict(train[regressors2])
test['pred2'] = clf.predict(test[regressors2])

clf = BayesianRidge(compute_score=True,normalize=True)
clf.fit(train[regressors3],train.target)
train['pred3'] = clf.predict(train[regressors3])
test['pred3'] = clf.predict(test[regressors3])

In [129]:
new_regressors = ['volume','price','pred1','pred2','pred3']
train[new_regressors+['target']].head(5)

Unnamed: 0,volume,price,pred1,pred2,pred3,target
0,0.01,273.7,27.691293,32.08908,31.499489,28.91
1,0.01,302.61,2.335279,-5.120105,-8.282582,41.45
2,0.01,344.06,-30.65225,-28.977956,-26.765174,-20.3
3,0.01,323.76,-12.573488,-18.829805,-16.423889,18.37
4,0.01,342.13,-15.947387,-22.363465,-28.202634,-19.88


We see for some periods the price difference predictions are quite accurate. Ensemble can boost performance in this case.

In [130]:
clf = BayesianRidge(compute_score=True,normalize=True,fit_intercept=True)
clf.fit(train[new_regressors],train.target)
pred = clf.predict(test[new_regressors])

In [131]:
plt.title("Prediction")
plt.plot(pred, color='lightgreen', label="Bayesian Ridge estimate")
plt.plot(test.target, label = 'ground truth')
plt.xlabel("Cases")
plt.ylabel("Estimated target")
plt.legend(loc="best", prop=dict(size=12))
plt.show()

Lets formulate a basic trading strategy. When current position is <= 0 and price diff >= buy limit we buy. If current
position is >= 0 and price diff <= sell limit we sell. Else we hold the position.

In [132]:
def backtest(pred,actual_price,current_pos=0,cum_profit=0,current_balance=0):
    profit = []
    balance = []
    for index, i in enumerate(pred):
        if i >= buy_limit and current_pos <= 0:
            current_pos = 1
            buy_price = actual_price[index]
            current_balance -= buy_price
        if i <= sell_limit and current_pos == 1:
            current_pos = 0
            sell_price = actual_price[index] 
            current_balance += sell_price
            cum_profit = current_balance
        profit.append(cum_profit)
        balance.append(current_balance)
    return balance, profit    

In [133]:
balance, profit = backtest(pred,list(test.price))

In [134]:
plt.title("Backtesting")
plt.plot(profit, color='blue', label="Profit")
plt.plot(test.price, label = 'Bitcoin price')
plt.xlabel("Time")
plt.ylabel("Price in dollar")
plt.legend(loc="best", prop=dict(size=12))
plt.show()

In [135]:
print "max profit at", test.time.iloc[profit.index(max(profit))], "with profit of USD ", max(profit), "and bitcoin price USD ",test.price.iloc[profit.index(max(profit))] 
print "profit at", test.time.iloc[0], "with profit of USD ", profit[0], "and bitcoin price USD ",test.price.iloc[0] 
print "profit at", test.time.iloc[-1], "with profit of USD ", profit[-1], "and bitcoin price USD ",test.price.iloc[-1] 

max profit at 2015-01-30 03:13:03 with profit of USD  47.79 and bitcoin price USD  236.57
profit at 2015-01-29 09:21:27 with profit of USD  0 and bitcoin price USD  238.08
profit at 2015-01-30 04:11:02 with profit of USD  46.96 and bitcoin price USD  236.03


We see some profit over time. Net profit is 47$ (20%) for a single bitcoin trading over the trading period of one day. 