In [16]:
# Get tensorboard running in the background
LOG_DIR = '/tmp/log'
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)

# Download and unzip ngrok
! wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
! unzip ngrok-stable-linux-amd64.zip

# Launch ngrok background process
get_ipython().system_raw('./ngrok http 6006 &')

# Retrive public url
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

--2018-10-27 03:27:11--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 52.203.66.95, 52.207.5.158, 52.203.102.189, ...
Connecting to bin.equinox.io (bin.equinox.io)|52.203.66.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5363700 (5.1M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip.9’


2018-10-27 03:27:13 (3.47 MB/s) - ‘ngrok-stable-linux-amd64.zip.9’ saved [5363700/5363700]

Archive:  ngrok-stable-linux-amd64.zip
replace ngrok? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ngrok                   
http://f93ebff0.ngrok.io


In [20]:
# Load the Drive helper and mount
from google.colab import drive

# This will proot for authorization
drive.mount('/content/drive')

import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import random
import numpy as np
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

SEQ_LEN = 60 # how long of a proceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3 # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "ETH-USD"
EPOCHS = 10 # how many passes through our data
BATCH_SIZE = 64 # how many batches? Try smaller batch if you're getting OOM (out of memory) errors.
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

def classify(current, future):
  if float(future) > float(current): # if the future price is higher than the current, that's a buy, or a 1.
    return 1
  else: # otherwise... it's a 0!
    return 0

def preprocess_df(df):
  df = df.drop('future', 1) # don't need this anymore
  
  for col in df.columns: # go through all of the columns
    if col != "target": # normalize all ... except for the target itself
      df[col] = df[col].pct_change() # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
      df.dropna(inplace=True) # remove the nas created by pct_change
      df[col] = preprocessing.scale(df[col].values) # scale between 0 and 1
      
  df.dropna(inplace=True) # cleanup again... jic. Those nasty NaNs love to creep in.
  
  sequential_data = [] # this is a list that will CONTAIN the sequences
  prev_days = deque(maxlen=SEQ_LEN) # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in
  
  for i in df.values: # iterate over the values
    prev_days.append([n for n in i[:-1]]) # store all but the target
    if len(prev_days) == SEQ_LEN: # make sure we have 60 sequences
      sequential_data.append([np.array(prev_days), i[-1]]) # append those bad boys
      
  random.shuffle(sequential_data) # shuffle for good measure
  
  buys = [] # list that will store our buy sequences and targets
  sells = [] # list that will store our sell sequences and targets
  
  for seq, target in sequential_data: # iterate over the sequential data
    if target == 0: # if it's a "not buy"
      sells.append([seq, target]) # append to sells list
    elif target == 1: # otherwise if the target is a 1...
      buys.append([seq, target]) # it's a buy
      
  random.shuffle(buys) # shuffle the buys
  random.shuffle(sells) # shuffle the sells
  
  lower = min(len(buys), len(sells)) # what's the shorted length?
  
  buys = buys[:lower] # make sure both lists are only up to the shortest length
  sells = sells[:lower] # make sure both lists are onyl up to the shortest length
  
  sequential_data = buys+sells # add them together
  random.shuffle(sequential_data) # another shuffle, so the model doesn't get confused will all 1 class then the other.
  
  X = []
  y = []
  
  for seq, target in sequential_data: # going over our new sequential data
    X.append(seq) # X is the sequences
    y.append(target) # y is the targets/labels (buys vs sell/not buy)
  
  return np.array(X), y # return X and y...and make X a numpy array!
  
# take featuresets and combine them into sequences of 60 of these featuresets

main_df = pd.DataFrame() # begin empty

ratios = ["BTC-USD", "LTC-USD", "ETH-USD", "BCH-USD"] # the 4 ratios we want to consider
for ratio in ratios: # begin iteration
  dataset = f"/content/drive/My Drive/Colab Notebooks/crypto/crypto_data/{ratio}.csv" # get full path to the file
  
  df = pd.read_csv(dataset, names=["time", "low", "high", "open", "close", "volume"]) # read in specific file
  
  # renaming volume and close to include the ticker so we can see which close/volume is which
  df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)
  
  df.set_index("time", inplace=True) # set time as index so we can join them on this shared time
  df = df[[f"{ratio}_close", f"{ratio}_volume"]] # ignore the other columns besides price and volume
  
  if len(main_df) == 0: # if the dataframe is empty
    main_df = df # then it's just the
  else: # otherwise, join this data to the main one
    main_df = main_df.join(df)
  
main_df["future"] = main_df[f"{RATIO_TO_PREDICT}_close"].shift(-FUTURE_PERIOD_PREDICT)

main_df["target"] = list(map(classify, main_df[f"{RATIO_TO_PREDICT}_close"], main_df["future"]))

# print(main_df[[f"{RATIO_TO_PREDICT}_close", "future", "target"]].head(10))

# Scale, normalize and put data in sequences

times = sorted(main_df.index.values) # get the times
last_5pct = times[-int(0.05*len(times))] # get the last 5% of the times
# print(last_5pct)

validation_main_df = main_df[(main_df.index >= last_5pct)] # make the validation data where the index is in the last 5%
main_df = main_df[(main_df.index < last_5pct)] # now the main_df is all the data up to the last 5%

# preprocess_df(main_df)
# preprocess the data
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

# print some stats
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

model = Sequential()
model.add(CuDNNLSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128, input_shape=(train_x.shape[1:])))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation="relu")) # use tnh for activation func if not using CuDNNLSTM
model.add(Dropout(0.2))

model.add(Dense(2, activation="softmax"))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

model.compile(loss='sparse_categorical_crossentropy',
             optimizer=opt,
             metrics=['accuracy'])

tensorboard = TensorBoard(log_dir=f'/tmp/log/{NAME}')

filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("/content/drive/My Drive/Colab Notebooks/crypto/models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

history = model.fit(train_x, train_y, 
                    batch_size=BATCH_SIZE, 
                    epochs=EPOCHS, 
                    validation_data=(validation_x, validation_y), 
                    callbacks=[tensorboard, checkpoint])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
train data: 74196 validation: 3260
Dont buys: 37098, buys: 37098
VALIDATION Dont buys: 1630, buys: 1630
Train on 74196 samples, validate on 3260 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
