In [80]:
import datetime as dt

import numpy as np
import pandas as pd

from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [81]:
# Load the raw market data from the CSV that sits next to the notebook.
# 'date' is read as plain strings here; it is converted in a later cell.
df = pd.read_csv('crypto-markets.csv')

In [82]:
# Preview the first rows of the raw frame (columns and value formats).
df.head()

Unnamed: 0,slug,symbol,name,date,ranknow,open,high,low,close,volume
0,bitcoin,BTC,Bitcoin,4/28/13,1,135.3,135.98,132.1,134.21,0.0
1,bitcoin,BTC,Bitcoin,4/29/13,1,134.44,147.49,134.0,144.54,0.0
2,bitcoin,BTC,Bitcoin,4/30/13,1,144.0,146.93,134.05,139.0,0.0
3,bitcoin,BTC,Bitcoin,5/1/13,1,139.0,139.89,107.72,116.99,0.0
4,bitcoin,BTC,Bitcoin,5/2/13,1,116.38,125.6,92.28,105.21,0.0


In [118]:
# Dates in the raw preview look like '4/28/13' (month/day/two-digit year).
# Passing the format explicitly removes any day/month ambiguity and avoids
# per-value format inference; errors='raise' still fails loudly on a value
# that does not match.
# NOTE(review): format taken from the first rows shown above — confirm it
# holds for the whole file.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y', errors='raise')

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x1a34c57470>>
Traceback (most recent call last):
  File "/Users/Christina/anaconda3/lib/python3.6/site-packages/xgboost/core.py", line 366, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


In [142]:
# The ticker symbol is not used by the analysis. errors='ignore' keeps the
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
df = df.drop(['symbol'], axis=1, errors='ignore')

In [143]:
# Typical-price features: mean of high/low/close, and mean of open/high/low/close.
df['hlc_average'] = df['high'].add(df['low']).add(df['close']).div(3)
df['ohlc_average'] = df['open'].add(df['high']).add(df['low']).add(df['close']).div(4)

In [144]:
# Check the two derived average columns on the first rows.
df.head()

Unnamed: 0,slug,name,date,ranknow,open,high,low,close,volume,hlc_average,ohlc_average
1344,bitcoin,Bitcoin,2017-01-01,1,963.66,1003.08,958.7,998.33,147775000.0,986.703333,980.9425
1345,bitcoin,Bitcoin,2017-01-02,1,998.62,1031.39,996.7,1021.75,222185000.0,1016.613333,1012.115
1346,bitcoin,Bitcoin,2017-01-03,1,1021.6,1044.08,1021.6,1043.84,185168000.0,1036.506667,1032.78
1347,bitcoin,Bitcoin,2017-01-04,1,1044.4,1159.42,1044.4,1154.73,344946000.0,1119.516667,1100.7375
1348,bitcoin,Bitcoin,2017-01-05,1,1156.73,1191.1,910.42,1013.38,510199000.0,1038.3,1067.9075


In [145]:
# Daily totals across every currency. numeric_only=True keeps the text
# columns ('slug', 'name') out of the sum; without it current pandas either
# concatenates the strings or raises.
# NOTE(review): 'ranknow' is summed as well, which has no obvious meaning;
# only 'hlc_average' and 'volume' are plotted below.
groupby = df.groupby('date', as_index=False).sum(numeric_only=True)
groupby

Unnamed: 0,date,ranknow,open,high,low,close,volume,hlc_average,ohlc_average
0,2017-01-01,417403,3616.32,3817.89,3533.44,3802.314385,1.936465e+08,3717.881462,3692.491096
1,2017-01-02,432118,3822.81,4023.32,3682.31,3799.611917,2.829008e+08,3835.080639,3832.012979
2,2017-01-03,439714,3795.94,3954.52,3728.24,3850.622806,2.667176e+08,3844.460935,3832.330701
3,2017-01-04,450766,3853.40,4412.57,3774.55,4308.159530,4.450464e+08,4165.093177,4087.169882
4,2017-01-05,444446,4310.55,4487.38,3377.70,3726.016149,6.208330e+08,3863.698716,3975.411537
5,2017-01-06,437570,3729.47,3843.55,3317.03,3377.331514,4.250679e+08,3512.637171,3566.845378
6,2017-01-07,441999,2065.19,2141.23,1891.84,2068.685594,3.477635e+08,2033.918531,2041.736398
7,2017-01-08,436463,3410.61,3539.98,3341.12,3488.747203,2.058830e+08,3456.615734,3445.114301
8,2017-01-09,447706,3490.44,3541.33,3310.19,3362.991270,1.970657e+08,3404.837090,3426.237818
9,2017-01-10,449702,3361.76,3498.13,3331.57,3415.040799,1.576378e+08,3414.913600,3401.625200


In [146]:
# Market-wide overview: summed HLC average on the left axis and summed
# volume on a secondary right-hand axis that overlays the first one.
trace0 = go.Scatter(name='HLC Average', x=groupby['date'], y=groupby['hlc_average'])
trace1 = go.Scatter(name='Volume', x=groupby['date'], y=groupby['volume'], yaxis='y2')

data = [trace0, trace1]
layout = go.Layout(
    title='General Overview',
    yaxis=dict(title='USD', nticks=10),
    yaxis2=dict(
        title='Transactions',
        nticks=5,
        showgrid=False,
        overlaying='y',
        side='right',
    ),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='time-series-overview')

In [147]:
# Keep 2017 onwards. The column is datetime64, so compare against a pandas
# Timestamp: ordering comparisons with a plain datetime.date are rejected by
# current pandas.
# NOTE(review): the outputs recorded for the earlier cells already start at
# 2017-01-01, which suggests this filter ran before them in the saved
# session; on a fresh top-to-bottom run the overview above would cover the
# full history.
df = df[df['date'] >= pd.Timestamp('2017-01-01')]

In [148]:
# Split the market into three groups by current rank:
#   rank 1 -> Bitcoin, ranks 2-10 -> "others", ranks > 10 -> "minor".
bitcoin = df[df['ranknow'] == 1]

# Daily mean over the group's currencies. numeric_only=True is required on
# current pandas because 'slug' and 'name' are text columns.
others = df[(df['ranknow'] > 1) & (df['ranknow'] <= 10)]
others = others.groupby('date', as_index=False).mean(numeric_only=True)

minor = df[df['ranknow'] > 10]
minor = minor.groupby('date', as_index=False).mean(numeric_only=True)

In [149]:
# Two panels side by side: price (left) and volume (right).
fig = tools.make_subplots(
    rows=1, cols=2,
    subplot_titles=('Crypto Currency Price', 'Transaction Volume'),
)

# For each group add its price trace to the left panel and its volume trace
# to the right panel, in the order Bitcoin, Others, Minor ones.
for label, group in (('Bitcoin', bitcoin), ('Others', others), ('Minor ones', minor)):
    fig.append_trace(go.Scatter(x=group['date'], y=group['hlc_average'], name=label), 1, 1)
    fig.append_trace(go.Scatter(x=group['date'], y=group['volume'], name=label), 1, 2)

fig['layout'].update(title='BitCoin vs others', showlegend=False)
fig['layout']['yaxis1'].update(title='USD')
fig['layout']['yaxis2'].update(title='Transactions')
for axis_name in ('xaxis1', 'xaxis2'):
    fig['layout'][axis_name].update(nticks=6)

py.iplot(fig, filename='bitcoin-vs-others')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [150]:
# Currencies ranked 2 through 10 (both ends included), i.e. the top ten
# without Bitcoin, followed by the list of their names.
top9 = df[df['ranknow'].between(2, 10)]
top9['name'].unique()

array(['Ethereum', 'Ripple', 'Bitcoin Cash', 'Litecoin', 'EOS', 'Cardano',
       'Stellar', 'NEO', 'IOTA'], dtype=object)

In [151]:
# Price (left) and volume (right) for each of the rank 2-10 currencies.
fig = tools.make_subplots(
    rows=1, cols=2,
    subplot_titles=('Crypto Currency Price', 'Transaction Volume'),
)

for name in top9['name'].unique():
    crypto = top9.loc[top9['name'] == name]
    # Column 1 holds the price series, column 2 the volume series.
    for col, field in enumerate(('hlc_average', 'volume'), start=1):
        fig.append_trace(go.Scatter(x=crypto['date'], y=crypto[field], name=name), 1, col)

fig['layout'].update(title='Other Crypto Currencies Comparison', showlegend=False)
fig['layout']['yaxis1'].update(title='USD')
fig['layout']['yaxis2'].update(title='Transactions')
for axis_name in ('xaxis1', 'xaxis2'):
    fig['layout'][axis_name].update(nticks=6)

py.iplot(fig, filename='other-crypto-currencies-comparison')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [152]:
# Per-currency means over the analysed period, cheapest first.
# numeric_only=True is required on current pandas because 'slug' is text;
# it also leaves the 'date' column out of the result.
summary = top9.groupby('name', as_index=False).mean(numeric_only=True)
summary.sort_values('close', ascending=True)

Unnamed: 0,name,ranknow,open,high,low,close,volume,hlc_average,ohlc_average
8,Stellar,8,0.107751,0.116771,0.097684,0.108728,65926910.0,0.107727,0.107733
1,Cardano,7,0.288239,0.312102,0.261136,0.288909,229574200.0,0.287383,0.287597
7,Ripple,3,0.398686,0.42951,0.367728,0.39976,583565100.0,0.399,0.398921
4,IOTA,10,1.384126,1.493427,1.257308,1.385273,104428000.0,1.378669,1.380033
2,EOS,6,4.542313,4.886828,4.170784,4.559202,301429000.0,4.538938,4.539782
6,NEO,9,33.668753,36.031782,30.854031,33.783129,103395100.0,33.556314,33.584424
5,Litecoin,5,76.8098,81.161114,72.134833,77.101024,430296400.0,76.79899,76.801693
3,Ethereum,2,349.163808,365.53314,330.962339,350.166147,1251846000.0,348.887209,348.956359
0,Bitcoin Cash,4,1106.709472,1194.996057,1021.766301,1107.688902,977048100.0,1108.15042,1107.790183


In [153]:
# Split the rank 2-10 currencies into two hand-picked groups and average
# each group per day. numeric_only=True is required on current pandas
# because 'slug' and 'name' are text columns.
# NOTE(review): the hard-coded ranks do not line up with the mean closes
# shown by the summary cell (rank 4 has the highest mean close yet is in
# low_price; rank 8 has the lowest yet is in high_price) — confirm the
# intended grouping; the selection itself is left unchanged here.
low_price = top9[top9['ranknow'].isin([4, 6, 7, 9])]
low_price = low_price.groupby('date', as_index=False).mean(numeric_only=True)

high_price = top9[top9['ranknow'].isin([2, 3, 5, 8, 10])]
high_price = high_price.groupby('date', as_index=False).mean(numeric_only=True)

In [154]:
# Price (left) and volume (right) for the two hand-picked groups.
fig = tools.make_subplots(
    rows=1, cols=2,
    subplot_titles=('Crypto Currency Price', 'Transaction Volume'),
)

for label, group in (('Low Price', low_price), ('High Price', high_price)):
    fig.append_trace(go.Scatter(x=group['date'], y=group['hlc_average'], name=label), 1, 1)
    fig.append_trace(go.Scatter(x=group['date'], y=group['volume'], name=label), 1, 2)

# The first trace added (the 'Low Price' price series) is moved onto a
# separate axis, y3, which is defined below as a right-hand overlay of y1.
fig['data'][0].update(yaxis='y3')
fig['layout'].update(title='High vs Low Prices Comparison', showlegend=False)
fig['layout']['yaxis1'].update(title='USD')
fig['layout']['yaxis2'].update(title='Transactions')
for axis_name in ('xaxis1', 'xaxis2'):
    fig['layout'][axis_name].update(nticks=6)
fig['layout']['yaxis3'] = dict(
    anchor='x1', domain=[0.0, 1.0], nticks=6,
    overlaying='y1', side='right', showgrid=False,
)

py.iplot(fig, filename='high-vs-low-prices-comparison')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [155]:
# Independent copy of the Bitcoin rows, so columns added later do not touch
# df; show the most recent rows.
currency = df.loc[df['name'] == 'Bitcoin'].copy()
currency.tail()

Unnamed: 0,slug,name,date,ranknow,open,high,low,close,volume,hlc_average,ohlc_average
1824,bitcoin,Bitcoin,2018-04-26,1,8867.32,9281.51,8727.09,9281.51,8970560000.0,9096.703333,9039.3575
1825,bitcoin,Bitcoin,2018-04-27,1,9290.63,9375.47,8987.05,8987.05,7566290000.0,9116.523333,9160.05
1826,bitcoin,Bitcoin,2018-04-28,1,8939.27,9412.09,8931.99,9348.48,7805480000.0,9230.853333,9157.9575
1827,bitcoin,Bitcoin,2018-04-29,1,9346.41,9531.49,9193.71,9419.08,8853000000.0,9381.426667,9372.6725
1828,bitcoin,Bitcoin,2018-04-30,1,9426.11,9477.14,9166.81,9240.55,8673920000.0,9294.833333,9327.6525


In [156]:
# Colours shared by the candlesticks and the volume bars.
increasing_color = '#17BECF'
decreasing_color = '#7F7F7F'

# Vertical layout: the volume bars use the bottom 20% of the figure (axis
# 'y', tick labels hidden) and the price chart uses 20%-80% (axis 'y2').
layout = dict(
    xaxis=dict(rangeselector=dict(visible=True)),
    # Adding a volume bar chart for candlesticks is a good practice usually
    yaxis=dict(domain=[0, 0.2], showticklabels=False),
    yaxis2=dict(domain=[0.2, 0.8]),
    legend=dict(orientation='h', y=0.9, yanchor='bottom'),
    margin=dict(t=40, b=40, r=40, l=40),
)

# Defining main chart
trace0 = go.Candlestick(
    x=currency['date'],
    open=currency['open'],
    high=currency['high'],
    low=currency['low'],
    close=currency['close'],
    yaxis='y2',
    name='Bitcoin',
    increasing=dict(line=dict(color=increasing_color)),
    decreasing=dict(line=dict(color=decreasing_color)),
)

# Traces are collected in this list; later parts of the cell append to it.
data = [trace0]

# Adding some range buttons to interact: a reset button followed by
# 6/3/1-month look-back buttons, placed at the top-left of the price panel.
month_buttons = [
    {'count': months, 'label': f'{months} mo', 'step': 'month', 'stepmode': 'backward'}
    for months in (6, 3, 1)
]
rangeselector = {
    'visible': True,
    'x': 0,
    'y': 0.8,
    'buttons': [{'count': 1, 'label': 'reset', 'step': 'all'}] + month_buttons,
}

layout['xaxis']['rangeselector'] = rangeselector

# Setting volume bar chart colors: a bar is "increasing" when that row's
# close is strictly above the previous row's close. The first row has no
# previous close (shift yields NaN, so the comparison is False) and gets the
# decreasing colour, as before. Vectorised instead of a Python loop doing
# per-row .iloc lookups.
closed_higher = currency['close'].gt(currency['close'].shift(1))
colors = [increasing_color if up else decreasing_color for up in closed_higher]

trace1 = go.Bar(
    x=currency['date'], y=currency['volume'],
    marker=dict(color=colors),
    yaxis='y', name='Volume'
)

data.append(trace1)

# Adding Moving Average
def moving_average(interval, window_size=10):
    """Return the centred moving average of ``interval``.

    The series is convolved with a flat kernel of ``window_size`` equal
    weights using numpy's 'same' mode, so the result has the same length as
    the input. Near both ends the kernel overhangs the data, so those values
    are averaged over fewer real points (the missing ones count as zero).

    Parameters
    ----------
    interval : array-like of numbers
        Values to smooth.
    window_size : int, default 10
        Number of points in the averaging window.

    Returns
    -------
    numpy.ndarray
        Smoothed values, same length as ``interval``.
    """
    width = int(window_size)
    kernel = np.full(width, 1.0) / float(window_size)
    return np.convolve(interval, kernel, mode='same')

# Moving average of the close, drawn on the price axis. Five points are
# trimmed from each end, where the 10-point window overhangs the data.
smoothed_close = moving_average(currency['close'])
trace2 = go.Scatter(
    x=currency['date'].iloc[5:-5],
    y=smoothed_close[5:-5],
    yaxis='y2',
    name='Moving Average',
    line=dict(width=1),
)

data.append(trace2)

# Adding Bollinger bands
def bollinger_bands(price, window_size=10, num_of_std=5):
    """Return the upper and lower Bollinger bands of a price series.

    The bands are the rolling mean plus/minus ``num_of_std`` rolling standard
    deviations. Both arguments are now honoured; previously the window and
    the multiplier were hard-coded to 10 and 5 regardless of what was passed.
    The defaults reproduce the old behaviour.

    Parameters
    ----------
    price : pandas.Series
        Price values in chronological order.
    window_size : int, default 10
        Number of observations in the rolling window.
    num_of_std : float, default 5
        Band half-width, in rolling standard deviations.

    Returns
    -------
    tuple of pandas.Series
        ``(upper_band, lower_band)``, indexed like ``price``. The first
        ``window_size - 1`` entries are NaN because the window is not full.
    """
    rolling = price.rolling(window_size)
    rolling_mean = rolling.mean()
    band_width = rolling.std() * num_of_std
    upper_band = rolling_mean + band_width
    lower_band = rolling_mean - band_width
    return upper_band, lower_band

bb_upper, bb_lower = bollinger_bands(currency['close'])

# Both bands share the same look and legend group; only the upper band gets
# a legend entry, so the pair appears once in the legend.
band_style = dict(
    x=currency['date'],
    yaxis='y2',
    line=dict(width=1),
    marker=dict(color='#ccc'),
    hoverinfo='none',
    name='Bollinger Bands',
    legendgroup='Bollinger Bands',
)
trace3 = go.Scatter(y=bb_upper, **band_style)
trace4 = go.Scatter(y=bb_lower, showlegend=False, **band_style)
data.extend([trace3, trace4])

# Assemble everything collected in `data` and render it.
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Bitcoin-candlestick')

In [157]:
# Supervised target: the close price found 30 rows further down. The last
# 30 rows have no such value and get NaN; they are used later as the rows to
# forecast.
# NOTE(review): assumes one row per calendar day, so 30 rows == 30 days — confirm.
currency['target'] = currency['close'].shift(-30)

In [158]:
# Rows with a known target. dropna() is evaluated once and both the features
# and the labels are taken from the same frame.
X = currency.dropna().copy()
y = X['target']

# Calendar features via the vectorised .dt accessor (instead of a Python
# lambda applied to every row).
X['year'] = X['date'].dt.year
X['month'] = X['date'].dt.month
X['day'] = X['date'].dt.day
X = X.drop(['date', 'slug', 'name', 'ranknow', 'target'], axis=1)

# NOTE(review): train_test_split shuffles by default, so test days sit
# between training days of the same time series. The scores reported below
# are therefore likely optimistic for real forecasting; consider a
# chronological split (shuffle=False or TimeSeriesSplit). Left unchanged so
# the recorded results stay comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train.shape, X_test.shape

((364, 10), (91, 10))

In [159]:
# Rows without a target (the tail of the series): these are the inputs the
# final model predicts from.
forecast = currency[currency['target'].isnull()]
forecast = forecast.drop('target', axis=1)

# Build the same feature columns as the training matrix, using the
# vectorised .dt accessor instead of a per-row lambda.
X_forecast = forecast.copy()
X_forecast['year'] = X_forecast['date'].dt.year
X_forecast['month'] = X_forecast['date'].dt.month
X_forecast['day'] = X_forecast['date'].dt.day
X_forecast = X_forecast.drop(['date', 'slug', 'name', 'ranknow'], axis=1)

In [160]:
# The target column is no longer needed on the plotting frame.
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
currency = currency.drop('target', axis=1, errors='ignore')

In [161]:
# Models to compare, keyed by display name. They are regressors; the
# variable keeps the name 'classifiers' because the evaluation loop reads it.
# GradientBoostingRegressor now gets a fixed random_state like the random
# forest, so repeated runs give the same fitted model.
classifiers = {
    'LinearRegression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=1),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=500, random_state=1)
}

In [162]:
# Inspect the training features; the shuffled index comes from the random split.
X_train

Unnamed: 0,open,high,low,close,volume,hlc_average,ohlc_average,year,month,day
1616,4166.11,4358.43,4160.86,4338.71,1.207450e+09,4286.000000,4256.0275,2017,9,30
1687,15168.40,15850.60,13226.60,15455.40,1.343330e+10,14844.200000,14925.2500,2017,12,10
1662,6634.76,7342.25,6634.76,7315.54,4.200880e+09,7097.516667,6981.8275,2017,11,15
1670,8232.38,8267.40,8038.77,8038.77,4.225180e+09,8114.980000,8144.3300,2017,11,23
1612,3928.41,3969.89,3869.90,3892.35,1.043740e+09,3910.713333,3915.1375,2017,9,26
1795,7836.83,8122.89,7809.17,7954.48,4.935290e+09,7962.180000,7930.8425,2018,3,28
1671,8074.02,8374.16,7940.93,8253.69,5.058610e+09,8189.593333,8160.7000,2017,11,24
1424,1120.65,1120.65,1014.21,1049.14,3.808410e+08,1061.333333,1076.1625,2017,3,22
1390,1007.65,1033.37,1007.65,1027.44,1.222770e+08,1022.820000,1019.0275,2017,2,16
1565,3341.84,3453.45,3319.47,3381.28,1.515110e+09,3384.733333,3374.0100,2017,8,10


In [163]:
# Fit every model and collect its test-set metrics. Predictions are computed
# once per model and reused for all metrics (previously predict() was called
# six times per model), and the unused fit() return value is dropped.
summary = []
for name, clf in classifiers.items():
    print(name)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    scores = {
        'MSE': mean_squared_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
    }

    print(f"R2: {scores['R2']:.2f}")
    print(f"MAE: {scores['MAE']:.2f}")
    print(f"MSE: {scores['MSE']:.2f}")
    print()

    # One record per model; turned into a DataFrame by the comparison cell.
    summary.append({**scores, 'name': name})

LinearRegression
R2: 0.75
MAE: 1549.59
MSE: 4973478.57

Random Forest Regressor
R2: 0.98
MAE: 353.63
MSE: 346883.67

Gradient Boosting Regressor
R2: 0.97
MAE: 445.29
MSE: 552113.63



In [164]:
# XGBoost works on its own DMatrix containers built from the raw arrays.
dtrain = xgb.DMatrix(X_train.values, y_train.values)
dtest = xgb.DMatrix(X_test.values)

param = {
    'max_depth': 10,
    'eta': 0.3
}
num_round = 20
bst = xgb.train(param, dtrain, num_round)

# make prediction (once; previously predict() was called six times)
xgb_pred = bst.predict(dtest)
xgb_scores = {
    'MSE': mean_squared_error(y_test, xgb_pred),
    'MAE': mean_absolute_error(y_test, xgb_pred),
    'R2': r2_score(y_test, xgb_pred),
}

print('XGBoost')
print(f"R2: {xgb_scores['R2']:.2f}")
print(f"MAE: {xgb_scores['MAE']:.2f}")
print(f"MSE: {xgb_scores['MSE']:.2f}")

# Same record layout as the scikit-learn models so all four can be compared.
summary.append({**xgb_scores, 'name': 'XGBoost'})

XGBoost
R2: 0.99
MAE: 306.62
MSE: 240639.56


In [165]:
# One bar chart per metric, models on the x axis.
summary = pd.DataFrame(summary)

metrics = ('R2', 'MAE', 'MSE')
fig = tools.make_subplots(rows=1, cols=len(metrics), subplot_titles=metrics)

for col, metric in enumerate(metrics, start=1):
    fig.append_trace(go.Bar(x=summary['name'], y=summary[metric], name=metric), 1, col)

fig['layout'].update(title='Regression Metrics Comparison', showlegend=False)

py.iplot(fig, filename='regression-metrics-comparison')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]



In [166]:
# Refit the random forest and predict the target for the rows that had none.
clf = RandomForestRegressor(n_estimators=100, random_state=1)
clf.fit(X_train, y_train)
target = clf.predict(X_forecast)

# Historical values indexed by date. The forecast rows are already part of
# `currency`, so concatenating them again only duplicated rows that the
# per-date mean then collapsed; the mean is kept to get a date index and to
# drop the text columns (numeric_only=True is required on current pandas).
final = currency.groupby('date').mean(numeric_only=True)

# Predictions are dated from the day after the last observation, one per
# day; periods follows the number of predictions instead of a literal 30.
day_one_forecast = currency['date'].iloc[-1] + dt.timedelta(days=1)
date = pd.date_range(day_one_forecast, periods=len(target), freq='D')
predictions = pd.DataFrame(target, columns=['target'], index=date)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
final = pd.concat([final, predictions])
final.index.names = ['date']
final = final.reset_index()

trace0 = go.Scatter(
    x=final['date'], y=final['close'],
    name='Close'
)

trace1 = go.Scatter(
    x=final['date'], y=final['target'],
    name='Target'
)

data = [trace0, trace1]
layout = go.Layout(
    title='Prediction Visualization',
    yaxis={
        'title': 'USD',
        'nticks': 10,
    },
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='prediction-visualization')