In [41]:
import sys
# Record which interpreter produced this notebook's outputs: path, version string, version tuple.
print(sys.executable, sys.version, sys.version_info, sep="\n")

c:\users\inves\appdata\local\programs\python\python35\python.exe
3.5.3 (v3.5.3:1880cb95a742, Jan 16 2017, 16:02:32) [MSC v.1900 64 bit (AMD64)]
sys.version_info(major=3, minor=5, micro=3, releaselevel='final', serial=0)


In [1]:
import numpy as np
import pandas as pd
import random
from collections import deque
from sklearn import preprocessing

In [70]:
# The bare string below is only a reminder of the available tickers; it is not assigned or used.
'''
1. "LTC-USD"
2. "ETH-USD"
3. "BTC-USD"
4. "BCH-USD"
'''
# Configuration constants read by the labelling, preprocessing and model-naming cells.
SEQ_LEN = 60  # number of preceding rows collected into each input sequence for the RNN
FUTURE_PERIOD_PREDICT = 3  # how many rows ahead the "future" close is taken from (used as the shift distance)
RATIO_TO_PREDICT = "ETH-USD"  # ticker whose "<ticker>_close" column is used to build the target

In [71]:
def classify(current, future):
    """Label a price pair as a buy (1) or a not-buy (0).

    Both arguments are converted with ``float`` before being compared.

    Returns:
        1 when ``future`` is strictly greater than ``current``; 0 otherwise
        (equal prices, a lower future price, or a NaN on either side).
    """
    price_rises = float(future) > float(current)
    return 1 if price_rises else 0

In [72]:
def preprocess_df(df):
    """Convert a price/volume frame into balanced, shuffled sequences for the RNN.

    Processing steps, in order:
      1. Drop the "future" column (it was only needed to build the target).
      2. For every column except "target": replace the values with their
         percent change, drop the rows containing NaN, then standardize the
         column with ``preprocessing.scale``.
      3. Slide a window of ``SEQ_LEN`` rows over the frame; each full window
         (all columns except the last) is paired with the last-column value of
         the window's final row.
      4. Split the pairs into buys (label 1) and sells (label 0), truncate both
         to the size of the smaller group, merge and shuffle.

    Args:
        df: DataFrame containing a "future" column and a "target" column.
            The label is read positionally as the LAST column of each row, so
            "target" must be the last column once "future" is removed.

    Returns:
        (X, y) where X is a numpy array of the sequences and y is a plain
        Python list of the matching labels (callers use ``y.count``).

    Note:
        Shuffling uses the global ``random`` state, so results are only
        reproducible if the caller seeds ``random`` beforehand.
    """
    # Pass axis by keyword: recent pandas releases reject a positional axis here.
    df = df.drop("future", axis=1)

    for col in df.columns:
        if col != "target":  # the label column must stay untouched
            # Percent change puts coins with very different price levels on a comparable footing.
            df[col] = df[col].pct_change()
            df.dropna(inplace=True)  # remove the NaN row introduced by pct_change
            # Standardize the column (preprocessing.scale centres and scales it;
            # it does NOT squash values into the 0..1 range).
            df[col] = preprocessing.scale(df[col].values)

    df.dropna(inplace=True)  # final cleanup in case any NaN is left

    sequential_data = []  # [window, label] pairs
    # deque(maxlen=...) discards the oldest row automatically once it is full.
    prev_days = deque(maxlen=SEQ_LEN)

    for row in df.values:
        prev_days.append(list(row[:-1]))  # features only: everything except the last column
        if len(prev_days) == SEQ_LEN:  # emit only complete windows
            sequential_data.append([np.array(prev_days), row[-1]])

    random.shuffle(sequential_data)

    # Separate the two classes so they can be balanced.
    buys = []
    sells = []
    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])

    random.shuffle(buys)
    random.shuffle(sells)

    # Truncate both classes to the size of the smaller one.
    lower = min(len(buys), len(sells))
    buys = buys[:lower]
    sells = sells[:lower]

    # Merge and shuffle again so the classes are interleaved.
    sequential_data = buys + sells
    random.shuffle(sequential_data)

    X = [seq for seq, _ in sequential_data]
    y = [target for _, target in sequential_data]

    return np.array(X), y

In [73]:
# Accumulator for the joined close/volume columns of every ticker.
main_df = pd.DataFrame() # begin empty

# Tickers to load; each name is used to build the CSV path and the column names.
ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 ratios we want to consider

In [74]:
for ratio in ratios:
    print(ratio)
    dataset = 'Data/sentdex/{}.csv'.format(ratio)  # path of this ticker's CSV file

    # The CSV is read with explicit column names; close/volume are renamed to
    # carry the ticker so they remain distinguishable after joining, and the
    # timestamp becomes the index that the join is performed on.
    df = (
        pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
        .rename(columns={"close": "{}_close".format(ratio), "volume": "{}_volume".format(ratio)})
        .set_index("time")
    )
    # Keep only this ticker's price and volume columns.
    df = df[["{}_close".format(ratio), "{}_volume".format(ratio)]]
    print(df.head())

    # The first ticker seeds the combined frame; later ones are joined onto it.
    main_df = df if len(main_df) == 0 else main_df.join(df)

main_df.head()

BTC-USD
            BTC-USD_close  BTC-USD_volume
time                                     
1528968660    6489.549805        0.587100
1528968720    6487.379883        7.706374
1528968780    6479.410156        3.088252
1528968840    6479.410156        1.404100
1528968900    6479.979980        0.753000
LTC-USD
            LTC-USD_close  LTC-USD_volume
time                                     
1528968660      96.580002        9.647200
1528968720      96.660004      314.387024
1528968780      96.570000       77.129799
1528968840      96.500000        7.216067
1528968900      96.389999      524.539978
BCH-USD
            BCH-USD_close  BCH-USD_volume
time                                     
1528968660     871.719971        5.675361
1528968720     870.859985       26.856577
1528968780     870.099976        1.124300
1528968840     870.789978        1.749862
1528968900     870.000000        1.680500
ETH-USD
            ETH-USD_close  ETH-USD_volume
time                                     
15

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1528968660,6489.549805,0.5871,96.580002,9.6472,871.719971,5.675361,,
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759


In [75]:
# Fill gaps with the previously known value, then drop rows that still contain
# NaN (for example leading rows that have nothing to fill from).
main_df.ffill(inplace=True)
main_df.dropna(inplace=True)

# "future" is the close price FUTURE_PERIOD_PREDICT rows ahead; "target" is 1
# when that future price is higher than the current one.
main_df['future'] = main_df['{}_close'.format(RATIO_TO_PREDICT)].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df['{}_close'.format(RATIO_TO_PREDICT)], main_df['future']))

# The final FUTURE_PERIOD_PREDICT rows have no future price (NaN after the
# shift); classify() labels them 0, so remove them rather than keep bogus labels.
main_df.dropna(inplace=True)

# Hold out the most recent 5% of timestamps for validation.
times = sorted(main_df.index.values)
# max(1, ...) prevents an index of -0 (the FIRST timestamp) on small frames,
# which would put every row into the validation set.
last_5pct = times[-max(1, int(0.05 * len(times)))]

validation_main_df = main_df[(main_df.index >= last_5pct)]  # rows at or after the cut-off
main_df = main_df[(main_df.index < last_5pct)]  # rows strictly before the cut-off


In [76]:
# Start again from an empty frame before the tickers are re-loaded and joined.
main_df = pd.DataFrame() # begin empty

# Tickers to load; each name is used to build the CSV path and the column names.
ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"] 

In [77]:
for ratio in ratios:
    # Strip a '.csv' suffix if one is present, leaving just the ticker.
    ratio = ratio.partition('.csv')[0]
    dataset = 'Data/sentdex/{}.csv'.format(ratio)  # path of this ticker's CSV file

    # The CSV is read with explicit column names; close/volume are renamed to
    # carry the ticker so they remain distinguishable after joining, and the
    # timestamp becomes the index that the join is performed on.
    df = (
        pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
        .rename(columns={"close": "{}_close".format(ratio), "volume": "{}_volume".format(ratio)})
        .set_index("time")
    )
    # Keep only this ticker's price and volume columns.
    df = df[["{}_close".format(ratio), "{}_volume".format(ratio)]]

    # The first ticker seeds the combined frame; later ones are joined onto it.
    main_df = df if len(main_df) == 0 else main_df.join(df)

In [78]:
# Fill gaps with the previously known value, then drop rows that still contain
# NaN (for example leading rows that have nothing to fill from).
main_df.ffill(inplace=True)
main_df.dropna(inplace=True)

# "future" is the close price FUTURE_PERIOD_PREDICT rows ahead; "target" is 1
# when that future price is higher than the current one.
main_df['future'] = main_df['{}_close'.format(RATIO_TO_PREDICT)].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df['{}_close'.format(RATIO_TO_PREDICT)], main_df['future']))

# Rows at the end have a NaN "future" after the shift; drop them so they do not
# contribute labels derived from a missing price.
main_df.dropna(inplace=True)

# Hold out the most recent 5% of timestamps for validation.
times = sorted(main_df.index.values)
# max(1, ...) prevents an index of -0 (the FIRST timestamp) on small frames,
# which would put every row into the validation set.
last_5pct = times[-max(1, int(0.05 * len(times)))]

validation_main_df = main_df[(main_df.index >= last_5pct)]  # rows at or after the cut-off
main_df = main_df[(main_df.index < last_5pct)]  # rows strictly before the cut-off

# Each split is normalized, windowed and class-balanced independently.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print("train data: {} validation: {}".format(len(train_x), len(validation_x)))
print("Dont buys: {}, buys: {}".format(train_y.count(0), train_y.count(1)))
print("VALIDATION Dont buys: {}, buys: {}".format(validation_y.count(0), validation_y.count(1)))

train data: 83428 validation: 4120
Dont buys: 41714, buys: 41714
VALIDATION Dont buys: 2060, buys: 2060


In [79]:
import time

# Training hyper-parameters and the run name used for the TensorBoard log directory.
EPOCHS = 10  # how many passes through our data
BATCH_SIZE = 64  # samples per batch (passed to model.fit as batch_size). Try a smaller value if you're getting OOM (out of memory) errors.
NAME = "{}-SEQ-{}-PRED-{}".format(SEQ_LEN, FUTURE_PERIOD_PREDICT, int(time.time()))  # unique per run: sequence length, prediction horizon, current Unix timestamp

In [80]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, BatchNormalization
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint

In [81]:
# Three stacked LSTM blocks (each followed by dropout and batch normalization),
# then a small dense head ending in a 2-way softmax (not-buy / buy).
model = Sequential([
    # The first two LSTMs return the full sequence so the next LSTM can consume it.
    LSTM(128, activation='relu', input_shape=train_x.shape[1:], return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),  # normalizes the layer's activations before the next block

    LSTM(128, activation='relu', return_sequences=True),
    Dropout(0.1),
    BatchNormalization(),

    # The last LSTM returns only its final output, which feeds the dense layers.
    LSTM(128, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),

    Dense(32, activation='relu'),
    Dropout(0.2),

    Dense(2, activation='softmax'),
])

In [82]:
# Adam optimizer with a small learning-rate decay.
opt = keras.optimizers.Adam(decay=1e-6, lr=0.001)

# Labels are integer class ids (0/1), hence the sparse categorical loss.
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [83]:
# Write TensorBoard logs into a run-specific sub-directory of logs/.
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

# File-name template for saved models; the {epoch} and {val_acc} placeholders
# are left intact here so the checkpoint callback can fill them in on each save.
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"

# Keep only checkpoints that improve validation accuracy.
# The monitoring options must be arguments of ModelCheckpoint itself: previously
# they were passed to str.format() by a misplaced parenthesis and silently
# ignored, so the callback ran with its default settings.
# NOTE(review): 'val_acc' matches the `acc` metric name in this notebook's
# training log; newer Keras releases report 'val_accuracy' — confirm on upgrade.
checkpoint = ModelCheckpoint(
    "models/{}.model".format(filepath),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

In [None]:
# Fit on the training sequences, validating against the held-out slice;
# the label lists are converted to numpy arrays for Keras.
history = model.fit(
    x=train_x,
    y=np.array(train_y),
    validation_data=(validation_x, np.array(validation_y)),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[tensorboard, checkpoint],
)

Train on 83428 samples, validate on 4120 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
13120/83428 [===>..........................] - ETA: 465s - loss: 0.6915 - acc: 0.5183

To launch TensorBoard:
1. Open a command prompt (cmd) on Windows.
2. cd to the directory that contains this notebook's "logs" folder.
3. Type in "tensorboard --logdir=logs".
4. Wait for TensorBoard to print the address to open in your browser. For me it's usually: http://ChiInvestments:6006