references: 

https://www.youtube.com/watch?v=ne-dpRdNReI&t=209s

https://www.youtube.com/watch?v=yWkpRdpOiPY&t=293s

https://pythonprogramming.net/cryptocurrency-recurrent-neural-network-deep-learning-python-tensorflow-keras/

Note: I'm not using f-strings because it'll require python 3.6+

In [27]:
import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import random
import numpy as np
import time

In [63]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, BatchNormalization #CuDNNLSTM
#set when we want to have checkpoint, can be for validation accuracy, loss, etc (save when there's a max and avoid overfitting problem)
from keras.callbacks import TensorBoard, ModelCheckpoint 

In [8]:
df = pd.read_csv('Data\sentdex\LTC-USD.csv', 
                 names = ['time', 'low', 'high', 'open', 'close', 'volume'])
df.head()

Unnamed: 0,time,low,high,open,close,volume
0,1528968660,96.580002,96.589996,96.589996,96.580002,9.6472
1,1528968720,96.449997,96.669998,96.589996,96.660004,314.387024
2,1528968780,96.470001,96.57,96.57,96.57,77.129799
3,1528968840,96.449997,96.57,96.57,96.5,7.216067
4,1528968900,96.279999,96.540001,96.5,96.389999,524.539978


In [16]:
main_df = pd.DataFrame()
ratios = ['BTC-USD', 'LTC-USD', 'ETH-USD', 'BCH-USD']
for ratio in ratios:
    dataset = "Data/sentdex/{}.csv".format(ratio)
    
    df = pd.read_csv(dataset, names = ['time', 'low', 'high', 'open', 'close', 'volume'])
    
    df.rename(columns = {'close': '{}_close'.format(ratio), 'volume': '{}_volume'.format(ratio)}, inplace = True)
    df.set_index('time', inplace = True)
    df = df[['{}_close'.format(ratio), '{}_volume'.format(ratio)]]
    
    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)
    
print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968660    6489.549805        0.587100      96.580002        9.647200   
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   

            ETH-USD_close  ETH-USD_volume  BCH-USD_close  BCH-USD_volume  
time                                                                      
1528968660            NaN             NaN     871.719971        5.675361  
1528968720      486.01001       26.019083     870.859985       26.856577  
1528968780      486.00000        8.449400     870.099976        1.124300  
1528968840      485.75000       26.994646     870.789978        1.749862  
1528968900      4

Here's what we need to think about:
- SEQ_LEN: Using the last how ever long sequence
- FUTURE_PERIOD_PREDICT: how many future period to predict? in this instance, each period is 1 minute
- RATIO_TO_PREDICT: what are we going to predict?

In [65]:
SEQ_LEN = 60
FUTURE_PERIOD_PREDICT = 3
RATIO_TO_PREDICT = 'LTC-USD'
EPOCHS = 10
BATCH_SIZE = 64
NAME = '{}-SEQ-{}-PRED-{}'.format(SEQ_LEN, FUTURE_PERIOD_PREDICT, int(time.time()))

In [18]:
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

In [21]:
main_df['future'] = main_df['{}_close'.format(RATIO_TO_PREDICT)].shift(-FUTURE_PERIOD_PREDICT)
main_df.head()

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,ETH-USD_close,ETH-USD_volume,BCH-USD_close,BCH-USD_volume,future
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1528968660,6489.549805,0.5871,96.580002,9.6472,,,871.719971,5.675361,96.5
1528968720,6487.379883,7.706374,96.660004,314.387024,486.01001,26.019083,870.859985,26.856577,96.389999
1528968780,6479.410156,3.088252,96.57,77.129799,486.0,8.4494,870.099976,1.1243,96.519997
1528968840,6479.410156,1.4041,96.5,7.216067,485.75,26.994646,870.789978,1.749862,96.440002
1528968900,6479.97998,0.753,96.389999,524.539978,486.0,77.355759,870.0,1.6805,96.470001


In [22]:
main_df['target'] = list(map(classify, main_df['{}_close'.format(RATIO_TO_PREDICT)], main_df['future']))
main_df.head()

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,ETH-USD_close,ETH-USD_volume,BCH-USD_close,BCH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968660,6489.549805,0.5871,96.580002,9.6472,,,871.719971,5.675361,96.5,0
1528968720,6487.379883,7.706374,96.660004,314.387024,486.01001,26.019083,870.859985,26.856577,96.389999,0
1528968780,6479.410156,3.088252,96.57,77.129799,486.0,8.4494,870.099976,1.1243,96.519997,0
1528968840,6479.410156,1.4041,96.5,7.216067,485.75,26.994646,870.789978,1.749862,96.440002,0
1528968900,6479.97998,0.753,96.389999,524.539978,486.0,77.355759,870.0,1.6805,96.470001,1


Separate out out-of-sample data

We can't just shuffle and take a %. Reason being that with this method, there will be training data that are very very close to the testing data. As we overfit our training dataset, we will have an illusion of a good testing dataset performance, as these two dataset will look very similar.

For sequential data, we need a chunck. Out-of-sample data needs to be those that are recent. In other words, we need a % of in sequence data (most recent chunck), separate those out to be out out-of-sample test. This will be the same as a forward test.

In [29]:
times = sorted(main_df.index.values)
last_5pct = times[-int(0.05*len(times))] #the most recent 5% would be the testing dataset

validation_main_df = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]

Balance, scale, etc

In [74]:
def preprocess_df(df):
    df = df.drop("future", 1)  # don't need this anymore.

    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            df.dropna(inplace=True)  # remove the nas created by pct_change
            df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

    df.dropna(inplace=True)  # cleanup again... jic.


    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days), i[-1]])  # append those bad boys!

    random.shuffle(sequential_data)  # shuffle for good measure.

    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!

    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!

    lower = min(len(buys), len(sells))  # what's the shorter length?

    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.

    sequential_data = buys+sells  # add them together
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), y  # return X and y...and make X a numpy a

In [75]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

In [90]:
validation_x.shape

(3062, 60, 8)

In [77]:
print('train data: {} validation: {}'.format(len(train_x), len(validatin_x)))
print('Don\'t buys: {}, buys: {}'.format(train_y.count(0), train_y.count(1)))
print('Validation don\'t buys: {}, buys: {}'.format(validation_y.count(0), validation_y.count(1)))

train data: 69188 validation: 3062
Don't buys: 34594, buys: 34594
Validation don't buys: 1531, buys: 1531


In [93]:
model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True, activation = 'relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())  #normalizes activation outputs, same reason you want to normalize your input data.

model.add(LSTM(128, return_sequences=True, activation = 'relu'))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128, activation = 'relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

opt = keras.optimizers.Adam(lr = 0.001, decay = 1e-6)

model.compile(loss = 'sparse_categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])

In [80]:
tensorboard = TensorBoard(log_dir = 'Models/logs/{}'.format(NAME))
filepath = 'RNN_Final-{epoch:02d}-{val_acc:.3f}'
checkpoint = ModelCheckpoint('Models/{}.model'.format(filepath, monitor = 'val_acc', verbose = 1, save_best_only = True, mode = 'max'))

In [94]:
history = model.fit(train_x, train_y, batch_size = BATCH_SIZE, epochs = EPOCHS,
                   validation_data = (validation_x, validation_y),
                   callbacks = [tensorboard, checkpoint])

ValueError: Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 69188 arrays: [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0,...