## Config

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from punisher.common import *

import itertools
import sklearn
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn import preprocessing
import statsmodels.formula.api as smf

## Helpers

In [None]:
def plot_AB(y1, y2, x=None, fs=(20,12), title=None):
    """Overlay two series on a single figure.

    y1 is drawn with the legend label 'preds' and y2 with 'targs'.
    x     -- shared x values; when omitted, 0..len(y1)-1 is used.
    fs    -- figure size in inches, handed to `set_size_inches`.
    title -- axes title, passed through unchanged.
    """
    xs = np.arange(len(y1)) if x is None else x
    fig, ax = plt.subplots()
    fig.set_size_inches(fs)
    ax.set_title(title)
    for series, name in ((y1, 'preds'), (y2, 'targs')):
        ax.plot(xs, series, label=name)
    ax.grid()
    ax.legend(loc='upper left')
    
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array of counts (rows are true labels, columns predictions,
                 matching the axis labels set at the end).
    classes   -- tick labels, used for both axes.
    normalize -- when True, each row is divided by its row sum before the
                 matrix is printed and plotted.
    title     -- plot title.
    cmap      -- colour map handed to `plt.imshow`.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_fmt = '.2f'
    else:
        cell_fmt = 'd'
    # Cells above half the maximum get white text so the number stays readable.
    midpoint = cm.max() / 2.
    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_fmt),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

def get_price_data(asset, exchange_id, timeframe, start, end, lead=750, lag=3000):
    """Load the OHLCV frame for `asset` on `exchange_id` and add two moving averages.

    If no file exists at the path reported by `ohlcv_feed.get_ohlcv_fpath`, the
    path is printed and the data is fetched and saved first (`start`/`end` are
    only used for that fetch).

    The returned frame is sorted by 'utc' and has two extra columns:
      'lead' -- rolling mean of the close column over `lead` rows
      'lag'  -- rolling mean of the close column over `lag` rows
    The first window-1 rows of each average are NaN and are kept on purpose;
    callers drop them when they need complete rows.
    """
    exchange = load_exchange(exchange_id)
    fpath = ohlcv_feed.get_ohlcv_fpath(asset, exchange_id, timeframe)
    if not os.path.exists(fpath):
        print(fpath)
        ohlcv_feed.fetch_and_save_asset(exchange, asset, timeframe, start, end)
    df = ohlcv_feed.load_asset(fpath).sort_values(by='utc')
    close_col = ohlcv_feed.get_col_name('close', asset.symbol, exchange_id)
    closes = df[close_col]
    df['lead'] = closes.rolling(lead).mean()
    df['lag'] = closes.rolling(lag).mean()
    return df

def plot_price(df, field, asset, ex_id):
    """Chart one OHLCV field of `df` with `plot_range`, passing no start/end bound.

    field -- field name (e.g. 'close') resolved to a column name via
             `ohlcv_feed.get_col_name` for this asset symbol and exchange id.
    """
    column = ohlcv_feed.get_col_name(field, asset.symbol, ex_id)
    punisher.utils.charts.plot_range(df, start=None, end=None, column_name=column)

## Data Feed

In [None]:
# https://coinmarketcap.com/api/
# Experiment configuration: ETH/BTC on Poloniex, 2016-01-01 to 2018-01-01,
# at Timeframe.ONE_DAY.
exchange_id = ex_cfg.POLONIEX
asset = Asset(coins.ETH, coins.BTC)
start = datetime.datetime(year=2016, month=1, day=1)
end = datetime.datetime(year=2018, month=1, day=1)
timeframe = Timeframe.ONE_DAY
exchange = load_exchange(exchange_id)
# Loads the saved OHLCV file (fetching it first if missing) and adds the
# 'lead'/'lag' rolling means of the close.
df = get_price_data(asset, exchange_id, timeframe, start, end)
# Names of the close and volume columns inside `df`.
close_col = ohlcv_feed.get_col_name('close', asset.symbol, exchange_id)
volume_col = ohlcv_feed.get_col_name('volume', asset.symbol, exchange_id)
len(df)

In [None]:
# Chronological hold-out: rows before the cut-off train, rows from it onward validate.
# The cut-off is named once so the two masks cannot drift apart.
split_date = datetime.datetime(year=2017, month=9, day=1)
train = df[df['utc'] < split_date]
val = df[df['utc'] >= split_date]

In [None]:
# Chart the close price of the training split.
plot_price(train, 'close', asset, exchange_id)

## SMA

* https://medium.com/@eliquinox/cryptocurrency-data-analysis-part-iii-backtesting-evaluating-and-optimising-a-trading-strategy-9bc9b1179a8b
* https://www.investopedia.com/university/movingaverage/movingaverages4.asp
* https://blog.patricktriest.com/analyzing-cryptocurrencies-python/
* https://github.com/AdamStone/cryptrade
* https://pythonprogramming.net/advanced-matplotlib-graphing-charting-tutorial/
* https://www.tradingview.com/script/TuG4VjJX-Crypto-Adjusted-Moving-Average-CAMA/
* https://www.tradingview.com/cryptocurrency-signals/
* https://romanorac.github.io/cryptocurrency/analysis/2017/12/29/cryptocurrency-analysis-with-python-part3.html

In [None]:
# Work on a copy so the SMA columns added below do not touch `df`.
sma_df = df.copy()
sma_df.head()

In [None]:
# Replace the default 'lead'/'lag' columns with 250- and 500-row rolling means of the close.
# NOTE(review): the first window-1 rows of each are NaN; with Timeframe.ONE_DAY over
# two years the 500-row average presumably covers only a small tail -- confirm the
# windows were not tuned for 30-minute bars.
sma_df['lead'] = sma_df[close_col].rolling(250).mean()
sma_df['lag'] = sma_df[close_col].rolling(500).mean()

In [None]:
# Close price with both moving averages on one chart.
sma_df[[close_col,'lead','lag']].plot(figsize = (16,10))

In [None]:
# Moving-average crossover back-test.
# regime: +1 when (lead - lag) / close exceeds pc_thresh, -1 when it is below
# -pc_thresh, otherwise 0.
# NOTE(review): a 3000-row window needs at least 3000 rows before the first
# non-NaN value; if `df` holds daily bars for two years, ma_df is presumably
# empty after the first dropna -- confirm against the loaded timeframe.
lead, lag = 1000, 3000
pc_thresh = .025

ma_df = sma_df.copy()
ma_df['lead'] = ma_df[close_col].rolling(lead).mean()
ma_df['lag'] = ma_df[close_col].rolling(lag).mean()
ma_df.dropna(inplace = True)
ma_df['lead-lag'] = ma_df['lead'] - ma_df['lag']
ma_df['pc_diff'] = ma_df['lead-lag'] / ma_df[close_col]
ma_df['regime'] = np.select(
    [ma_df['pc_diff'] > pc_thresh, ma_df['pc_diff'] < -pc_thresh],
    [1, -1],
    default=0)
# Per-row log return of the market; the strategy earns it scaled by the
# previous row's regime, so the position is decided before the move.
ma_df['Market'] = np.log(ma_df[close_col] / ma_df[close_col].shift(1))
ma_df['Strategy'] = ma_df['regime'].shift(1) * ma_df['Market']
# Cumulative log returns turned back into growth multiples.
ma_df[['Market','Strategy']] = np.exp(ma_df[['Market','Strategy']].cumsum())
ma_df.dropna(inplace=True)

In [None]:
# Regime signal over time (values are -1, 0 or 1).
ma_df['regime'].plot(figsize=(16,5))

In [None]:
# Final cumulative multiples of market vs. strategy. Printed explicitly: a bare
# expression that is not the last line of a cell is never displayed.
print(ma_df[['Market','Strategy']].iloc[-1])
ma_df[['Market','Strategy']].plot(figsize = (16,10))

## MACD

* https://romanorac.github.io/cryptocurrency/analysis/2017/12/17/cryptocurrency-analysis-with-python-part1.html

## Linear Regression (OLS)

* http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
* http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
* http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

In [None]:
# Numpy Close Data Array
# Pull the close prices and timestamps of each split out as plain arrays.
all_utc_arr = np.array(df['utc'])
print(all_utc_arr.shape)

trn_arr = np.array(train[close_col])
trn_utc_arr = np.array(train['utc'])

val_arr = np.array(val[close_col])
val_utc_arr = np.array(val['utc'])
print(trn_arr.shape, trn_utc_arr.shape,val_arr.shape,val_utc_arr.shape)

# Train and validation closes on one time axis.
plt.plot(trn_utc_arr, trn_arr)
plt.plot(val_utc_arr, val_arr)

In [None]:
def get_inp_targs(arr, prior_periods, target_period):
    """Build sliding-window inputs and percent-change targets from a 1-D close series.

    Sample i pairs the window arr[i : i + prior_periods] with the relative
    change from arr[i] to arr[i + target_period]. The change is measured from
    the FIRST close of the window, not the last one.

    Returns (inp, targs):
      inp   -- shape (n, prior_periods)
      targs -- shape (n, 1)
    with n = max(len(arr) - target_period, 0). Both shapes are printed.

    Raises ValueError when a period is < 1, or when prior_periods exceeds
    target_period: the window would then contain the very close being
    predicted (look-ahead leakage), and the last windows would be shorter
    than the rest.
    """
    arr = np.asarray(arr)
    if prior_periods < 1 or target_period < 1:
        raise ValueError("prior_periods and target_period must both be >= 1")
    if prior_periods > target_period:
        raise ValueError("prior_periods must not exceed target_period")

    n_samples = max(len(arr) - target_period, 0)
    if n_samples:
        inp = np.stack([arr[i:i + prior_periods] for i in range(n_samples)])
    else:
        # Keep a consistent 2-D shape even when the series is too short.
        inp = np.empty((0, prior_periods), dtype=arr.dtype)

    start_close = arr[:n_samples]
    end_close = arr[target_period:target_period + n_samples]
    targs = ((end_close - start_close) / start_close)[:, np.newaxis]
    print(inp.shape, targs.shape)
    return inp,targs

In [None]:
# 12-row input windows; target is the percent change 24 rows after each window start.
trn_inp,trn_targs = get_inp_targs(trn_arr, 12, 24)
val_inp,val_targs = get_inp_targs(val_arr, 12, 24)

In [None]:
# Ordinary least squares on the raw close windows.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version still accepts it.
linreg = LinearRegression(normalize=True)

linreg.fit(trn_inp, trn_targs)

In [None]:
# Intercept and Coefficients
# One coefficient per position in the input window.
print (linreg.intercept_)
print (linreg.coef_)

In [None]:
# Predict
# Predicted percent change for every train and validation window.
trn_preds = linreg.predict(trn_inp)
val_preds = linreg.predict(val_inp)
trn_preds.shape, trn_targs.shape, trn_utc_arr.shape

In [None]:
# There are fewer targets than rows, so trim the timestamps to the same length;
# sample i is then plotted at the timestamp of row i.
trn_utc_arr = trn_utc_arr[:len(trn_targs)]
plot_AB(trn_preds, trn_targs, x=np.expand_dims(trn_utc_arr,1), 
        title='Linear Regress Train Preds')

In [None]:
# Same trimming and plot for the validation split.
val_utc_arr = val_utc_arr[:len(val_targs)]
plot_AB(val_preds, val_targs, x=np.expand_dims(val_utc_arr,1), title='Linear Regress Val Preds')

In [None]:
# http://www.ritchieng.com/machine-learning-evaluate-linear-regression-model/
# Training-set errors of the predicted percent change.
print("MAE", metrics.mean_absolute_error(trn_targs, trn_preds))
print("MSE", metrics.mean_squared_error(trn_targs, trn_preds))
print("RMSE", np.sqrt(metrics.mean_squared_error(trn_targs, trn_preds)))

In [None]:
# http://www.ritchieng.com/machine-learning-evaluate-linear-regression-model/
# Validation-set errors of the predicted percent change.
print("MAE", metrics.mean_absolute_error(val_targs, val_preds))
print("MSE", metrics.mean_squared_error(val_targs, val_preds))
print("RMSE", np.sqrt(metrics.mean_squared_error(val_targs, val_preds)))

## Logistic Regression (Close)

* https://github.com/bfortuner/ml-study/blob/master/LogisticRegression.ipynb

In [None]:
def get_labels(pct_targs, threshold):
    """Bucket percent changes into three classes.

    pct_targs -- sequence of rows; only the first element of each row is read.
    threshold -- band half-width: changes below -threshold map to 0,
                 changes above +threshold map to 2, everything else to 1.

    Returns an array of shape (n, 1) holding the class ids.
    """
    def bucket(change):
        if change < -threshold:
            return 0
        return 2 if change > threshold else 1

    classes = [bucket(row[0]) for row in pct_targs]
    return np.array(classes)[:, np.newaxis]

def get_one_hot_categorical(targs):
    """One-hot encode `targs` with a freshly fitted OneHotEncoder; returns a dense array."""
    encoder = preprocessing.OneHotEncoder()
    encoded = encoder.fit_transform(targs)
    return encoded.toarray()

# trn_onehots = get_one_hot_categorical(trn_targs, .01)
# val_onehots = get_one_hot_categorical(trn_targs, .01)

In [None]:
# Numpy Close Data Array
# Rebuild the arrays from the frames: the timestamp arrays were trimmed in place
# by the linear-regression plots above.
all_utc_arr = np.array(df['utc'])
print(all_utc_arr.shape)

trn_arr = np.array(train[close_col])
trn_utc_arr = np.array(train['utc'])

val_arr = np.array(val[close_col])
val_utc_arr = np.array(val['utc'])
print(trn_arr.shape, trn_utc_arr.shape,val_arr.shape,val_utc_arr.shape)

# Train and validation closes on one time axis.
plt.plot(trn_utc_arr, trn_arr)
plt.plot(val_utc_arr, val_arr)

In [None]:
# Windows, three-class labels (+/- threshold band) and inputs scaled to [-1, 1].
threshold = .02
trn_inp,trn_targs = get_inp_targs(trn_arr, 12, 24)
val_inp,val_targs = get_inp_targs(val_arr, 12, 24)
trn_labels = get_labels(trn_targs, threshold).ravel()   #Logistic regression expects (n,) shape
val_labels = get_labels(val_targs, threshold).ravel()
normalized_range = sklearn.preprocessing.MinMaxScaler(feature_range=(-1,1))
trn_inp = normalized_range.fit_transform(trn_inp)
# Reuse the min/max learned on the training windows. Refitting on the validation
# set would scale it differently from what the model was trained on.
val_inp = normalized_range.transform(val_inp)

trn_labels.shape,val_labels.shape

In [None]:
# Logistic regression with default settings on the scaled close windows.
model = LogisticRegression()
model.fit(trn_inp, trn_labels)

In [None]:
# Predict class labels
# Labels follow get_labels: 0 = below -threshold, 1 = inside the band, 2 = above +threshold.
trn_preds = model.predict(trn_inp)
val_preds = model.predict(val_inp)
trn_preds,val_preds

In [None]:
# Predict probabilities
trn_probs = model.predict_proba(trn_inp)
val_probs = model.predict_proba(val_inp)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(trn_probs.shape, val_probs.shape)

# predict_proba columns are ordered like model.classes_, so map the arg-max
# column back to its label instead of assuming column index == class id.
preds = model.classes_[np.argmax(trn_probs,axis=1)]
preds

In [None]:
# Accuracy
# Mean accuracy of the fitted model on each split.
print("Trn Acc", model.score(trn_inp, trn_labels))
print("Val Acc", model.score(val_inp, val_labels))

In [None]:
# Log Loss
# Cross entropy between the true labels and the predicted class probabilities.
print("Trn Cross Entropy", metrics.log_loss(trn_labels, trn_probs))
print("Val Cross Entropy", metrics.log_loss(val_labels, val_probs))

In [None]:
# Which classes does the model actually predict on the training set?
np.unique(trn_preds)

In [None]:
# Distribution of the predicted classes on the training set (these are
# predictions, not targets); three bins over the class ids 0..2.
plt.hist(trn_preds, bins=3, range=(0,2))

In [None]:
# Training set: true label distribution vs. predicted label distribution.
plt.hist(trn_labels, bins=3, alpha=0.5, label='label')
plt.hist(trn_preds, bins=3, alpha=0.5, label='pred')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Validation set: true label distribution vs. predicted label distribution.
plt.hist(val_labels, bins=3, alpha=0.5, label='label')
plt.hist(val_preds, bins=3, alpha=0.5, label='pred')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Accuracy
# Train accuracy first, then validation accuracy, from the predicted labels.
print (metrics.accuracy_score(trn_labels, trn_preds))
print (metrics.accuracy_score(val_labels, val_preds))

In [None]:
# Classification Report
# Per-class metrics on the validation split.
print (metrics.classification_report(val_labels, val_preds))

In [None]:
# Confusion Matrix
# https://github.com/bfortuner/ml-study/blob/master/tools/ConfusionMatrix.ipynb 
# Raw counts on the validation split.
print (metrics.confusion_matrix(val_labels, val_preds))

In [None]:
# Compute confusion matrix
# Pin the label order to the three get_labels classes so the matrix is always
# 3x3 and lines up with the tick names, even if a class never occurs in the
# validation labels or predictions.
cnf_matrix = metrics.confusion_matrix(val_labels, val_preds, labels=[0, 1, 2])
plt.figure()
plot_confusion_matrix(
    cnf_matrix, classes=['down','neutral','positive'],
    title='Confusion matrix')

## Logistic Regression (Close + Volume)

In [None]:
def get_inp_targs(arr, prior_periods, target_period):
    """Build flattened multi-column windows and percent-change targets.

    arr is a 2-D array (rows are time steps); column 0 must be the close price.
    Sample i pairs the rows arr[i : i + prior_periods], flattened row by row,
    with the relative change of column 0 from row i to row i + target_period.
    The change is measured from the FIRST row of the window.

    Returns (inp, targs):
      inp   -- shape (n, prior_periods * n_columns)
      targs -- shape (n, 1)
    with n = max(len(arr) - target_period, 0). Both shapes are printed.

    Raises ValueError when a period is < 1, or when prior_periods exceeds
    target_period: the window would then contain the close being predicted,
    and the last windows would be shorter than the rest.
    """
    arr = np.asarray(arr)
    if prior_periods < 1 or target_period < 1:
        raise ValueError("prior_periods and target_period must both be >= 1")
    if prior_periods > target_period:
        raise ValueError("prior_periods must not exceed target_period")

    n_cols = arr.shape[1]
    n_samples = max(len(arr) - target_period, 0)
    if n_samples:
        windows = np.stack([arr[i:i + prior_periods] for i in range(n_samples)])
        inp = windows.reshape((n_samples, prior_periods * n_cols))
    else:
        # Too few rows for a single sample: return empty, correctly shaped arrays.
        inp = np.empty((0, prior_periods * n_cols), dtype=arr.dtype)

    closes = arr[:, 0]
    start_close = closes[:n_samples]
    end_close = closes[target_period:target_period + n_samples]
    targs = ((end_close - start_close) / start_close)[:, np.newaxis]
    print(inp.shape, targs.shape)
    return inp,targs

In [None]:
# Chronological hold-out: rows before split_date train, rows from it onward validate.
split_date =  datetime.datetime(year=2017, month=9, day=1)
train = df[df['utc'] < split_date]
val = df[df['utc'] >= split_date]

In [None]:
# Numpy Close Data Array
# Two-column arrays this time: column 0 is the close, column 1 the volume.
all_utc_arr = np.array(df['utc'])
print(all_utc_arr.shape)

trn_arr = np.array(train[[close_col, volume_col]])
trn_utc_arr = np.array(train['utc'])

val_arr = np.array(val[[close_col, volume_col]])
val_utc_arr = np.array(val['utc'])
print(trn_arr.shape, trn_utc_arr.shape,val_arr.shape,val_utc_arr.shape)

# Price
plt.plot(trn_utc_arr, trn_arr[:,0])
plt.plot(val_utc_arr, val_arr[:,0])

In [None]:
# Volume
# Column 1 of each split on the shared time axis.
plt.plot(trn_utc_arr, trn_arr[:,1])
plt.plot(val_utc_arr, val_arr[:,1])

In [None]:
# Windows, three-class labels (+/- threshold band) and inputs scaled to [-1, 1].
threshold = .02
trn_inp,trn_targs = get_inp_targs(trn_arr, 12, 24)
val_inp,val_targs = get_inp_targs(val_arr, 12, 24)
trn_labels = get_labels(trn_targs, threshold).ravel()   #Logistic regression expects (n,) shape
val_labels = get_labels(val_targs, threshold).ravel()
normalized_range = sklearn.preprocessing.MinMaxScaler(feature_range=(-1,1))
trn_inp = normalized_range.fit_transform(trn_inp)
# Reuse the min/max learned on the training windows. Refitting on the validation
# set would scale it differently from what the model was trained on.
val_inp = normalized_range.transform(val_inp)

In [None]:
# Logistic regression with default settings on the scaled close + volume windows.
model = LogisticRegression()
model.fit(trn_inp, trn_labels)

In [None]:
# Predict class labels
# Labels follow get_labels: 0 = below -threshold, 1 = inside the band, 2 = above +threshold.
trn_preds = model.predict(trn_inp)
val_preds = model.predict(val_inp)

In [None]:
# Predict probabilities
trn_probs = model.predict_proba(trn_inp)
val_probs = model.predict_proba(val_inp)
# NOTE(review): this bare expression is not the last line of the cell, so its
# value is never displayed.
trn_probs.shape,val_probs.shape

# Shows the raw prediction and probability arrays for both splits.
trn_preds,trn_probs,val_preds,val_probs

In [None]:
# Training set: true label distribution vs. predicted label distribution.
plt.hist(trn_labels, bins=3, alpha=0.5, label='label')
plt.hist(trn_preds, bins=3, alpha=0.5, label='pred')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Validation set: true label distribution vs. predicted label distribution.
plt.hist(val_labels, bins=3, alpha=0.5, label='label')
plt.hist(val_preds, bins=3, alpha=0.5, label='pred')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Accuracy
# Share of correctly predicted labels on each split.
print ("Trn Acc", metrics.accuracy_score(trn_labels, trn_preds))
print ("Val Acc", metrics.accuracy_score(val_labels, val_preds))

In [None]:
# Compute confusion matrix (training split)
# Pin the label order to the three get_labels classes so the matrix is always
# 3x3 and lines up with the tick names.
cnf_matrix = metrics.confusion_matrix(trn_labels, trn_preds, labels=[0, 1, 2])
plt.figure()
plot_confusion_matrix(
    cnf_matrix, classes=['down','neutral','positive'],
    title='Confusion matrix')

In [None]:
# Compute confusion matrix (validation split)
# Pin the label order to the three get_labels classes so the matrix is always
# 3x3 and lines up with the tick names.
cnf_matrix = metrics.confusion_matrix(val_labels, val_preds, labels=[0, 1, 2])
plt.figure()
plot_confusion_matrix(
    cnf_matrix, classes=['down','neutral','positive'],
    title='Confusion matrix')

## Logistic Regression (Method)

In [None]:
def get_inp_targs(arr, prior_periods, target_period):
    """Build flattened multi-column windows and percent-change targets.

    arr is a 2-D array (rows are time steps); column 0 must be the close price.
    Sample i pairs the rows arr[i : i + prior_periods], flattened row by row,
    with the relative change of column 0 from row i to row i + target_period.
    The change is measured from the FIRST row of the window.

    Returns (inp, targs):
      inp   -- shape (n, prior_periods * n_columns)
      targs -- shape (n, 1)
    with n = max(len(arr) - target_period, 0).

    Raises ValueError when a period is < 1, or when prior_periods exceeds
    target_period: the window would then contain the close being predicted,
    and the last windows would be shorter than the rest.
    """
    arr = np.asarray(arr)
    if prior_periods < 1 or target_period < 1:
        raise ValueError("prior_periods and target_period must both be >= 1")
    if prior_periods > target_period:
        raise ValueError("prior_periods must not exceed target_period")

    n_cols = arr.shape[1]
    n_samples = max(len(arr) - target_period, 0)
    if n_samples:
        windows = np.stack([arr[i:i + prior_periods] for i in range(n_samples)])
        inp = windows.reshape((n_samples, prior_periods * n_cols))
    else:
        # Too few rows for a single sample: return empty, correctly shaped arrays.
        inp = np.empty((0, prior_periods * n_cols), dtype=arr.dtype)

    closes = arr[:, 0]
    start_close = closes[:n_samples]
    end_close = closes[target_period:target_period + n_samples]
    targs = ((end_close - start_close) / start_close)[:, np.newaxis]
    return inp,targs

def get_log_reg_inputs(df, columns, prior_periods, target_period,
                       threshold=.02, scaler=None):
    """Turn a frame into scaled window inputs, raw targets and class labels.

    df, columns   -- the frame and the columns to window; the first column
                     must be the close price (get_inp_targs reads column 0).
    prior_periods, target_period -- forwarded to get_inp_targs.
    threshold     -- band half-width forwarded to get_labels (default .02,
                     the value that used to be hard-coded here).
    scaler        -- optional scaler to share between splits. A scaler that
                     already has a `scale_` attribute is treated as fitted and
                     only `transform` is applied; an unfitted one is fitted on
                     this data. With None (the default) a fresh
                     MinMaxScaler(feature_range=(-1, 1)) is fitted per call,
                     which is the original behaviour.

    To scale validation data with training statistics, create one scaler,
    pass it to the training call first and then to the validation call.

    Returns (inp, targs, labels); labels is 1-D.
    """
    arr = np.array(df[columns])
    inp,targs = get_inp_targs(arr, prior_periods, target_period)
    labels = get_labels(targs, threshold).ravel()
    if scaler is None:
        scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1,1))
    already_fitted = hasattr(scaler, 'scale_')
    inp = scaler.transform(inp) if already_fitted else scaler.fit_transform(inp)
    return inp,targs,labels

def train_model(trn_inp, trn_labels):
    """Fit a default-configured LogisticRegression on the given data and return it."""
    return LogisticRegression().fit(trn_inp, trn_labels)

def predict(model, inp):
    """Return (class predictions, class probabilities) of `model` for `inp`."""
    labels_out = model.predict(inp)
    return labels_out, model.predict_proba(inp)

def evaluate(preds, probs, labels, plot_charts=True):
    """Score predictions against labels; optionally plot diagnostics.

    preds  -- predicted class ids
    probs  -- predicted class probabilities, one column per class
    labels -- true class ids (0 = down, 1 = neutral, 2 = positive, as
              produced by get_labels)
    plot_charts -- when True, show the confusion matrix and a histogram
              comparing label and prediction distributions.

    Returns (accuracy, log_loss).
    """
    class_ids = [0, 1, 2]
    if plot_charts:
        # Fixed label order keeps the matrix 3x3 and aligned with the tick
        # names even when a class is missing from this split.
        cnf_matrix = metrics.confusion_matrix(labels, preds, labels=class_ids)
        plt.figure()
        plot_confusion_matrix(
            cnf_matrix, classes=['down','neutral','positive'],
            title='Confusion matrix')
        plt.show()
        # Shared bin edges centred on the class ids, so both histograms are
        # comparable even if one series contains fewer distinct classes.
        hist_bins = np.arange(len(class_ids) + 1) - 0.5
        plt.hist(labels, bins=hist_bins, alpha=0.5, label='label')
        plt.hist(preds, bins=hist_bins, alpha=0.5, label='pred')
        plt.legend(loc='upper right')
        plt.show()
    acc = metrics.accuracy_score(labels, preds)
    if np.shape(probs)[1] == len(class_ids):
        # Name the classes explicitly so log_loss also works when `labels`
        # happens not to contain every class.
        logloss = metrics.log_loss(labels, probs, labels=class_ids)
    else:
        logloss = metrics.log_loss(labels, probs)
    return acc, logloss

In [None]:
def run_experiment(columns, prior_periods, target_period):
    """Train and score one logistic-regression configuration.

    Reads the notebook-level `train` and `val` frames, builds inputs for both,
    fits on the training split and prints accuracy and log loss per split.
    NOTE(review): each get_log_reg_inputs call scales its own split, so train
    and validation inputs are presumably not on the same scale -- confirm.
    """
    window = (columns, prior_periods, target_period)
    trn_inp, _, trn_labels = get_log_reg_inputs(train, *window)
    val_inp, _, val_labels = get_log_reg_inputs(val, *window)

    model = train_model(trn_inp, trn_labels)

    trn_out = predict(model, trn_inp)
    val_out = predict(model, val_inp)

    reports = (
        ("Trn - Acc: {:.4f} Loss: {:.4f}", trn_out, trn_labels),
        ("val - Acc: {:.4f} Loss: {:.4f}", val_out, val_labels),
    )
    for template, (preds, probs), labels in reports:
        acc, loss = evaluate(preds, probs, labels, False)
        print(template.format(float(acc), float(loss)))
        

In [None]:
# Prep inputs
# https://coinmarketcap.com/api/
# Reload the same ETH/BTC Poloniex frame at Timeframe.ONE_DAY for the
# function-based experiments below.
exchange_id = ex_cfg.POLONIEX
asset = Asset(coins.ETH, coins.BTC)
start = datetime.datetime(year=2016, month=1, day=1)
end = datetime.datetime(year=2018, month=1, day=1)
timeframe = Timeframe.ONE_DAY
exchange = load_exchange(exchange_id)
df = get_price_data(asset, exchange_id, timeframe, start, end)

In [None]:
# Window length and look-ahead used by the experiments below.
target_period = 24
prior_periods = 12

close_col = ohlcv_feed.get_col_name('close', asset.symbol, exchange_id)
volume_col = ohlcv_feed.get_col_name('volume', asset.symbol, exchange_id)
# Close must stay first: get_inp_targs computes the target from column 0.
columns = [close_col, volume_col]

# Chronological hold-out at split_date.
split_date =  datetime.datetime(year=2017, month=9, day=1)
train = df[df['utc'] < split_date]
val = df[df['utc'] >= split_date]
trn_inp,trn_targs,trn_labels = get_log_reg_inputs(
    train, columns, prior_periods, target_period)
val_inp,val_targs,val_labels = get_log_reg_inputs(
    val, columns, prior_periods, target_period)
val_inp.shape,val_targs.shape

In [None]:
# Single run with close + volume, 12-row windows, 24-row look-ahead.
prior_periods = 12
target_period = 24
columns = [close_col, volume_col]
run_experiment(columns, prior_periods, target_period)

In [None]:
# close seems best

# 8 prior, 1 - 4 predict
# Grid over window length (pp) and look-ahead (tp). The outer loop adds one
# column at a time; with a single entry in `columns` it runs once.
columns = [close_col]#, volume_col]
for i in range(1,len(columns)+1):
    cols = columns[:i]
    for pp in [4,8,12,16,20,24]:
        for tp in [pp+1, pp+2, pp+4, pp+6, pp+10, pp+12]:
            print(pp, tp)
            run_experiment(cols, pp, tp)

In [None]:
"""
Best Results (all 'close')

prior_periods = 4
target_period = 5 (next timestep)
Trn - Acc: 0.8065 Loss: 0.5897
val - Acc: 0.8188 Loss: 0.5779

4 6
Trn - Acc: 0.7819 Loss: 0.6430
val - Acc: 0.7938 Loss: 0.6383

4 8 
Trn - Acc: 0.7344 Loss: 0.7347
val - Acc: 0.7527 Loss: 0.7286

-------

8 9 
Trn - Acc: 0.7240 Loss: 0.6964
val - Acc: 0.7754 Loss: 0.6046

8 10
Trn - Acc: 0.7054 Loss: 0.7337
val - Acc: 0.7552 Loss: 0.6635

8 12 
Trn - Acc: 0.6745 Loss: 0.7906
val - Acc: 0.7310 Loss: 0.7294

-----

12 13
Trn - Acc: 0.6853 Loss: 0.7308
val - Acc: 0.7835 Loss: 0.6291

12 14
Trn - Acc: 0.6716 Loss: 0.7557
val - Acc: 0.7646 Loss: 0.6627

12 16
Trn - Acc: 0.6466 Loss: 0.8015
val - Acc: 0.7300 Loss: 0.7286


Questions:
1) When it's wrong, what are the financial consequences?
2) When it's right, how much do we make?
"""

## SVM

* https://machinelearningmastery.com/time-series-forecasting-supervised-learning/
* https://www.quantstart.com/articles/Forecasting-Financial-Time-Series-Part-1
* http://scikit-learn.org/stable/modules/svm.html

In [None]:
# Prep inputs
# https://coinmarketcap.com/api/
exchange_id = ex_cfg.POLONIEX
asset = Asset(coins.ETH, coins.BTC)
start = datetime.datetime(year=2016, month=1, day=1)
end = datetime.datetime(year=2018, month=1, day=1)
timeframe = Timeframe.THIRTY_MIN
exchange = load_exchange(exchange_id)
#ohlcv_feed.fetch_and_save_asset(exchange, asset, timeframe, start, end)
#feed = OHLCVFileFeed([exchange_id], [asset], timeframe, start, end)
# Loads the saved 30-minute file directly; unlike get_price_data this does not
# fetch the data when the file is missing.
fpath = ohlcv_feed.get_ohlcv_fpath(asset, exchange_id, timeframe)
df = ohlcv_feed.load_asset(fpath)
df.sort_values(by='utc', inplace=True)
close_col = ohlcv_feed.get_col_name('close', asset.symbol, exchange_id)
volume_col = ohlcv_feed.get_col_name('volume', asset.symbol, exchange_id)
df['lead'] = df[close_col].rolling(1000).mean()
df['lag'] = df[close_col].rolling(5000).mean()
# Drops the warm-up rows where the rolling means are still NaN.
df.dropna(inplace=True)

split_date =  datetime.datetime(year=2017, month=9, day=1)
train = df[df['utc'] < split_date]
val = df[df['utc'] >= split_date]
# NOTE(review): `columns`, `prior_periods` and `target_period` are whatever the
# earlier cells left behind -- confirm they hold the intended values here.
trn_inp,trn_targs,trn_labels = get_log_reg_inputs(
    train, columns, prior_periods, target_period)
val_inp,val_targs,val_labels = get_log_reg_inputs(
    val, columns, prior_periods, target_period)
print("Val", val_inp.shape,val_targs.shape,val_labels.shape)
# Class balance of the training labels. x and height are passed positionally
# because newer matplotlib releases no longer accept the old `left=` keyword.
unique_counts = np.unique(trn_labels, return_counts=True)
plt.bar(unique_counts[0], unique_counts[1], tick_label=['negative','neutral', 'positive'])

In [None]:
def run_svm_experiment(columns, prior_periods, target_period, random_state=None):
    """Train and score one SVM configuration on the notebook-level `train`/`val` frames.

    columns, prior_periods, target_period -- forwarded to get_log_reg_inputs.
    random_state -- forwarded to `svm.SVC`. With `probability=True` the
        probability estimates involve randomness, so pass an int to make a run
        repeatable. The default None keeps the original unseeded behaviour.

    Prints accuracy and log loss for the training and validation splits.
    """
    trn_inp,trn_targs,trn_labels = get_log_reg_inputs(
        train, columns, prior_periods, target_period)
    val_inp,val_targs,val_labels = get_log_reg_inputs(
        val, columns, prior_periods, target_period)

    model = svm.SVC(probability=True, random_state=random_state)
    model.fit(trn_inp, trn_labels)

    trn_preds,trn_probs = predict(model, trn_inp)
    val_preds,val_probs = predict(model, val_inp)

    acc, loss = evaluate(trn_preds, trn_probs, trn_labels, False)
    print("Trn - Acc: {:.4f} Loss: {:.4f}".format(float(acc), float(loss)))
    acc, loss = evaluate(val_preds, val_probs, val_labels, False)
    print("val - Acc: {:.4f} Loss: {:.4f}".format(float(acc), float(loss)))

In [None]:
# SVM on close-only windows.
columns = [close_col]
run_svm_experiment(columns, prior_periods, target_period)

In [None]:
# SVM on close + volume windows.
columns = [close_col, volume_col]
run_svm_experiment(columns, prior_periods, target_period)

In [None]:
# 8 prior, 1 - 4 predict
# Grid over window length (pp) and look-ahead (tp). The disabled values are kept
# as trailing comments AFTER the colon; previously the `#` swallowed the colon
# and the cell failed with a SyntaxError.
columns = [close_col]#, volume_col, 'lead', 'lag']
for i in range(1,len(columns)+1):
    cols = columns[:i]
    for pp in [4,8,12]:  # ,16,20,24
        for tp in [pp+1, pp+2, pp+4]:  # , pp+6, pp+10, pp+12
            print(pp, tp, cols)
            run_svm_experiment(cols, pp, tp)

## Percent Data

In [None]:
# Prep inputs
# https://coinmarketcap.com/api/
# Same market at Timeframe.THIRTY_MIN; get_price_data adds the default
# 'lead' (750-row) and 'lag' (3000-row) rolling means.
exchange_id = ex_cfg.POLONIEX
asset = Asset(coins.ETH, coins.BTC)
start = datetime.datetime(year=2016, month=1, day=1)
end = datetime.datetime(year=2018, month=1, day=1)
timeframe = Timeframe.THIRTY_MIN
exchange = load_exchange(exchange_id)
df = get_price_data(asset, exchange_id, timeframe, start, end)
target_period = 24
prior_periods = 12
close_col = ohlcv_feed.get_col_name('close', asset.symbol, exchange_id)
volume_col = ohlcv_feed.get_col_name('volume', asset.symbol, exchange_id)
columns = [close_col, volume_col, 'lead', 'lag']
df.head()

In [None]:
# Row-over-row relative change of the close. dropna then removes every row with
# a NaN in any column, which includes the warm-up rows of 'lead' and 'lag'.
df['pct_change'] = df[close_col].pct_change()
df.dropna(inplace=True)

In [None]:
# Spot-check the new column next to the close it was derived from.
df[['pct_change', close_col]].head()

In [None]:
def get_inp_targs(arr, prior_periods, target_period):
    """Build windows over column 1 and percent-change targets from column 0.

    arr is a 2-D array: column 0 must be the close price, column 1 the
    feature to window (the percent change in this section).
    Sample i pairs arr[i : i + prior_periods, 1] with the relative change of
    column 0 from row i to row i + target_period, measured from the FIRST
    row of the window.

    Returns (inp, targs):
      inp   -- shape (n, prior_periods); the shape is printed
      targs -- shape (n, 1)
    with n = max(len(arr) - target_period, 0).

    Raises ValueError when a period is < 1, or when prior_periods exceeds
    target_period (the last windows would be shorter than the rest and the
    window would reach the row being predicted).
    """
    arr = np.asarray(arr)
    if prior_periods < 1 or target_period < 1:
        raise ValueError("prior_periods and target_period must both be >= 1")
    if prior_periods > target_period:
        raise ValueError("prior_periods must not exceed target_period")

    n_samples = max(len(arr) - target_period, 0)
    feature = arr[:, 1]
    if n_samples:
        inp = np.stack([feature[i:i + prior_periods] for i in range(n_samples)])
    else:
        # Too few rows for a single sample: return empty, correctly shaped arrays.
        inp = np.empty((0, prior_periods), dtype=arr.dtype)
    print(inp.shape)

    closes = arr[:, 0]
    start_close = closes[:n_samples]
    end_close = closes[target_period:target_period + n_samples]
    targs = ((end_close - start_close) / start_close)[:, np.newaxis]
    return inp,targs

def get_log_reg_inputs(df, columns, prior_periods, target_period,
                       threshold=.02, scaler=None):
    """Turn a frame into scaled window inputs, raw targets and class labels.

    df, columns   -- the frame and the columns handed to get_inp_targs; the
                     first column must be the close price.
    prior_periods, target_period -- forwarded to get_inp_targs.
    threshold     -- band half-width forwarded to get_labels (default .02,
                     the value that used to be hard-coded here).
    scaler        -- optional scaler to share between splits. A scaler that
                     already has a `scale_` attribute is treated as fitted and
                     only `transform` is applied; an unfitted one is fitted on
                     this data. With None (the default) a fresh
                     MinMaxScaler(feature_range=(-1, 1)) is fitted per call,
                     which is the original behaviour.

    Returns (inp, targs, labels); labels is 1-D.
    """
    arr = np.array(df[columns])
    inp,targs = get_inp_targs(arr, prior_periods, target_period)
    labels = get_labels(targs, threshold).ravel()
    if scaler is None:
        scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1,1))
    already_fitted = hasattr(scaler, 'scale_')
    inp = scaler.transform(inp) if already_fitted else scaler.fit_transform(inp)
    return inp,targs,labels

def run_experiment(columns, prior_periods, target_period):
    """Train and score one logistic-regression configuration.

    Reads the notebook-level `train` and `val` frames, builds inputs for both,
    fits on the training split and prints accuracy and log loss per split.
    NOTE(review): each get_log_reg_inputs call scales its own split, so train
    and validation inputs are presumably not on the same scale -- confirm.
    """
    window = (columns, prior_periods, target_period)
    trn_inp, _, trn_labels = get_log_reg_inputs(train, *window)
    val_inp, _, val_labels = get_log_reg_inputs(val, *window)

    model = train_model(trn_inp, trn_labels)

    trn_out = predict(model, trn_inp)
    val_out = predict(model, val_inp)

    reports = (
        ("Trn - Acc: {:.4f} Loss: {:.4f}", trn_out, trn_labels),
        ("val - Acc: {:.4f} Loss: {:.4f}", val_out, val_labels),
    )
    for template, (preds, probs), labels in reports:
        acc, loss = evaluate(preds, probs, labels, False)
        print(template.format(float(acc), float(loss)))

In [None]:
# Close stays first (target source), 'pct_change' second (the windowed feature).
columns = [close_col, 'pct_change']
# Chronological hold-out at split_date.
split_date =  datetime.datetime(year=2017, month=9, day=1)
train = df[df['utc'] < split_date]
val = df[df['utc'] >= split_date]

In [None]:
# Grid over window length (pp) and look-ahead (tp) using percent-change windows.
cols = [close_col, 'pct_change']
for pp in [4,8,12,16,20,24]:
    for tp in [pp+1, pp+2, pp+4, pp+6, pp+10, pp+12]:
        print(pp, tp, cols)
        run_experiment(cols, pp, tp)

## Random Forest

* https://arxiv.org/pdf/1605.00003.pdf
* https://medium.com/making-sense-of-data/time-series-next-value-prediction-using-regression-over-a-rolling-window-228f0acae363
* http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

## ARIMA

* Auto Regressive Integrated Moving Average
* https://dashee87.github.io/data%20science/general/A-Road-Incident-Model-Analysis/
* https://machinelearningmastery.com/make-sample-forecasts-arima-python/
* https://machinelearningmastery.com/arima-for-time-series-forecasting-with-python/
* https://en.wikipedia.org/wiki/Box%E2%80%93Jenkins_method
* https://www.digitalocean.com/community/tutorials/a-guide-to-time-series-forecasting-with-arima-in-python-3
* https://www.digitalocean.com/community/tutorials/a-guide-to-time-series-forecasting-with-arima-in-python-3

In [None]:
# Very good tutorial here:
# https://www.digitalocean.com/community/tutorials/a-guide-to-time-series-forecasting-with-arima-in-python-3
    
# NOTE(review): `statsmodels.tsa.arima_model` was removed in newer statsmodels
# releases in favour of `statsmodels.tsa.arima.model` -- confirm the installed version.
from statsmodels.tsa.arima_model import ARIMA
from pandas.plotting import autocorrelation_plot
# Changes the matplotlib style for every plot drawn after this cell.
plt.style.use('fivethirtyeight')

In [None]:
# https://coinmarketcap.com/api/
# Back to Timeframe.ONE_DAY for the ARIMA experiments.
exchange_id = ex_cfg.POLONIEX
asset = Asset(coins.ETH, coins.BTC)
start = datetime.datetime(year=2016, month=1, day=1)
end = datetime.datetime(year=2018, month=1, day=1)
timeframe = Timeframe.ONE_DAY
exchange = load_exchange(exchange_id)
df = get_price_data(asset, exchange_id, timeframe, start, end)
df.head()

In [None]:
target_period = 24
prior_periods = 12

close_col = ohlcv_feed.get_col_name('close', asset.symbol, exchange_id)
volume_col = ohlcv_feed.get_col_name('volume', asset.symbol, exchange_id)
columns = [close_col]

# Close price and timestamp only.
close_utc = df[[close_col, 'utc']]
plot_price(close_utc, 'close', asset, exchange_id)

# Chronological hold-out, then index everything by timestamp. set_index
# returns new frames here; the in-place variant was modifying slices of
# another frame, which pandas warns about.
split_date =  datetime.datetime(year=2017, month=9, day=1)
train = close_utc[close_utc['utc'] < split_date].set_index('utc')
val = close_utc[close_utc['utc'] >= split_date].set_index('utc')
close_utc = close_utc.set_index('utc')

### Plot Autocorrelation

In [None]:
# How correlated is price with last t time periods?

# Timestamp-indexed close series, built without in-place mutation of a slice.
close_utc = df[[close_col, 'utc']].set_index('utc')
# Autocorrelation over the first 50, 100 and 1000 rows.
for n_rows in (50, 100, 1000):
    autocorrelation_plot(close_utc[:n_rows])
    plt.show()

### Model Training

* p = 5
    * is the auto-regressive part of the model. It allows us to incorporate the effect of past values into our model. Intuitively, this would be similar to stating that it is likely to be warm tomorrow if it has been warm the past 3 days.

* d = 1
    * is the integrated part of the model. This includes terms in the model that incorporate the amount of differencing (i.e. the number of past time points to subtract from the current value) to apply to the time series. Intuitively, this would be similar to stating that it is likely to be the same temperature tomorrow if the difference in temperature over the last three days has been very small.
* q = 0
    * is the moving average part of the model. This allows us to set the error of our model as a linear combination of the error values observed at previous time points in the past.

In [None]:
# The coef column shows the weight (i.e. importance) of each feature and how each one impacts
# the time series. 
# The P>|z| column informs us of the significance of each feature weight. 
# Each weight has a p-value lower or close to 0.05, so it is reasonable to retain all 
# of them?

# Fits ARIMA(p=5, d=1, q=0) on `train`, which is expected to be the
# timestamp-indexed training frame built in the split cell above.
model = ARIMA(train, order=(5,1,0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

In [None]:
# Plot residual errors
# Our primary concern is to ensure that the residuals of our model are uncorrelated and 
# normally distributed with zero-mean. If the seasonal ARIMA model does not satisfy these 
# properties, it is a good indication that it can be further improved.

# Residuals of the fitted model as a frame: line plot first.
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()

# Density plot of the residual error values, 
# Suggesting the errors are Gaussian, but may not be centered on zero.
residuals.plot(kind='kde')
plt.show()
# Summary statistics of the residuals (the mean shows how far they are from zero).
print(residuals.describe())

In [None]:
# Walk-forward validation on the last 5% of the series: refit on everything seen
# so far, forecast one step, then append the true observation to the history.
# NOTE(review): this rebinds `train` from the training frame to a numpy array,
# and refits the model once per test row, so the cell can be slow.
X = close_utc.values
size = int(len(X) * 0.95)
train, test = X[0:size], X[size:len(X)]
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARIMA(history, order=(5,1,0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    # First element of the forecast result is used as the prediction.
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))

In [None]:
# Mean squared error of the walk-forward forecasts.
error = metrics.mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
# plot
plt.figure(figsize=(18,10))
# Dedicated names for the plotted slice: reusing `start`/`end` would overwrite
# the datetime range that the data-loading cells pass to get_price_data.
plot_start = 0
plot_end = 100
plt.plot(test[plot_start:plot_end], color='blue')
plt.plot(predictions[plot_start:plot_end], color='red')

## Facebook Prophet

* https://github.com/facebook/prophet
* https://msperlin.github.io/2017-03-05-Prophet-and_stock-market/
* https://facebook.github.io/prophet/
* https://github.com/facebook/prophet/blob/master/notebooks/quick_start.ipynb
* https://github.com/facebook/prophet/blob/master/notebooks/non-daily_data.ipynb

* You'll need to ```pip install rpy2```

In [None]:
#%load_ext rpy2.ipython
from fbprophet import Prophet
import logging
# Only show errors from the fbprophet logger.
logging.getLogger('fbprophet').setLevel(logging.ERROR)
import warnings
# NOTE(review): this silences every warning for the rest of the session, not
# just the ones raised by Prophet.
warnings.filterwarnings("ignore")

In [None]:
# https://coinmarketcap.com/api/
# Reload the Timeframe.ONE_DAY frame; this also resets `start`/`end` to datetimes.
exchange_id = ex_cfg.POLONIEX
asset = Asset(coins.ETH, coins.BTC)
start = datetime.datetime(year=2016, month=1, day=1)
end = datetime.datetime(year=2018, month=1, day=1)
timeframe = Timeframe.ONE_DAY
exchange = load_exchange(exchange_id)
df = get_price_data(asset, exchange_id, timeframe, start, end)
df.head()

In [None]:
target_period = 24
prior_periods = 12

close_col = ohlcv_feed.get_col_name('close', asset.symbol, exchange_id)
volume_col = ohlcv_feed.get_col_name('volume', asset.symbol, exchange_id)
columns = [close_col]

# Close price and timestamp only.
close_utc = df[[close_col, 'utc']]
plot_price(close_utc, 'close', asset, exchange_id)

# Chronological hold-out at split_date.
# NOTE(review): the model below is fitted on `close_utc`, not on `train` --
# confirm that fitting on the full series is intended.
split_date =  datetime.datetime(year=2017, month=9, day=1)
train = close_utc[close_utc['utc'] < split_date]
val = close_utc[close_utc['utc'] >= split_date]

In [None]:
# Rename to the column names passed to Prophet below: 'y' for the close
# (first column) and 'ds' for the timestamp (second column).
close_utc.columns = ['y','ds']
close_utc.head()

In [None]:
# Fit Prophet on the whole close series with changepoint_prior_scale=0.01.
m = Prophet(changepoint_prior_scale=0.01)
m.fit(close_utc);

In [None]:
# Extend the timeline by 60 periods at daily frequency, matching the
# Timeframe.ONE_DAY data the model was fitted on. The previous '1800s'
# frequency asked for 30-minute steps, i.e. only 30 hours past the last bar.
future = m.make_future_dataframe(periods=60, freq='D')
future.tail()

In [None]:
# Forecast over history plus the future rows; show the last predictions with their bounds.
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# Forecast plot; the trailing semicolon suppresses the returned object's repr.
m.plot(forecast);

In [None]:
# Component plots of the forecast; the semicolon suppresses the repr.
m.plot_components(forecast);