## Data Preparation

#### Taking a quick look at the data

In [None]:
import pandas as pd
# Preview one raw file; column names are supplied explicitly through `names`.
columns = ['time', 'low', 'high', 'open', 'close', 'volume']
df = pd.read_csv('crypto_data/LTC-USD.csv', names=columns)

df.head()

#### Build a dataframe by combining the csv files

In [None]:
# Each pair's close/volume columns are outer-joined onto main_df using the "time" index.
ratios = ['BTC-USD', 'LTC-USD', 'ETH-USD', 'BCH-USD']
main_df = pd.DataFrame()

for ratio in ratios:
    close_col = f"{ratio}_close"
    volume_col = f"{ratio}_volume"
    dataset = f'crypto_data/{ratio}.csv'

    # Load one pair, prefix its close/volume columns with the pair name,
    # index by time and keep only those two columns.
    df = (
        pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
        .rename(columns={'close': close_col, 'volume': volume_col})
        .set_index("time")
        .loc[:, [close_col, volume_col]]
    )

    main_df = main_df.merge(df, left_index=True, right_index=True, how='outer')

main_df.head()

#### Create a new target column which indicates whether the price increases or decreases in the future

In [None]:
# Prediction settings: which item to predict and the window sizes, in minutes.
RATIO_TO_PREDICT = 'LTC-USD'   # item to predict
SEQ_LEN = 60                   # minutes of history fed to the model per sample
FUTURE_PERIOD_PREDICT = 3      # how many minutes ahead the prediction looks

In [None]:
def classify(current, future):
    """Return 1 if ``future`` is strictly greater than ``current``, otherwise 0.

    Both arguments are converted with ``float()`` before the comparison.
    """
    went_up = float(future) > float(current)
    return int(went_up)

price_col = f"{RATIO_TO_PREDICT}_close"

# shift() below is positional, so make sure the rows are in chronological order first
main_df = main_df.sort_index()

# 'future' holds the close price FUTURE_PERIOD_PREDICT rows after each timestamp
main_df['future'] = main_df[price_col].shift(-FUTURE_PERIOD_PREDICT)

# 'target' is 1 when the future price is higher than the current price, otherwise 0
main_df['target'] = list(map(classify, main_df[price_col], main_df['future']))

# Rows without a current or future price cannot be labelled: classify() returns 0
# for them because comparisons with NaN are False. Drop them instead of keeping a
# bogus "sell" label (this affects at least the last FUTURE_PERIOD_PREDICT rows).
main_df = main_df.dropna(subset=[price_col, 'future'])

# Find the timestamp where the last 5 percent of the data begins.
# max(1, ...) avoids times[-0] == times[0] on small frames, which would put
# every row into the validation set and leave the training set empty.
times = sorted(main_df.index.values)
last_5pct = times[-max(1, int(0.05*len(times)))]

# Slice out the last 5 percent of data (in order of time) as validation data
validation_main_df = main_df[main_df.index >= last_5pct]
main_df = main_df[main_df.index < last_5pct]

print(main_df.shape, validation_main_df.shape)

#### Create sequences from the data, i.e. each data point is the past 60 minutes of price data.
#### Balance the data so that both classes have equal counts when training (so the model doesn't just always predict the class with the highest occurrence in the training data).

In [None]:
from sklearn import preprocessing
from collections import deque
import random
import numpy as np

def preprocess_df(df):
    """Turn a price/volume frame into balanced, shuffled training sequences.

    Steps:
      1. Drop the helper column ``future``.
      2. Convert every feature column to percent change, remove rows that
         contain NaN or infinite values, then standardize each column
         (zero mean, unit variance) with ``preprocessing.scale``.
      3. Build sliding windows of ``SEQ_LEN`` consecutive rows; each window is
         labelled with the ``target`` of its most recent row.
      4. Shuffle, then truncate both classes to the size of the smaller one.

    Args:
        df: DataFrame containing the feature columns plus ``future`` and
            ``target``. The caller's frame is not modified.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a numpy array of windows and ``y`` is
        a plain list with the matching targets.
    """
    df = df.drop('future', axis=1)    # 'future' column no longer needed

    # Select features by name so the result does not depend on 'target' being last
    feature_cols = [col for col in df.columns if col != 'target']

    for col in feature_cols:
        df[col] = df[col].pct_change()  # percent change from the previous row

    # pct_change leaves NaN in the first row and can yield +/-inf when the previous
    # value is 0. Remove those rows BEFORE scaling: inf makes preprocessing.scale
    # raise, and NaN should not take part in the statistics.
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(axis=0, how='any', inplace=True)

    if not df.empty:  # preprocessing.scale rejects empty input
        for col in feature_cols:
            # standardize to zero mean / unit variance (not a 0-1 rescale)
            df[col] = preprocessing.scale(df[col].values)

    sequential_data = []
    window = deque(maxlen=SEQ_LEN)  # rolling buffer of the latest SEQ_LEN feature rows

    for features, target in zip(df[feature_cols].values, df['target'].values):
        window.append(features)

        # Once the buffer is full, emit (window of features, target of the current row)
        if len(window) == SEQ_LEN:
            sequential_data.append([np.array(window), target])

    random.shuffle(sequential_data)

    # Split by class: target 1 -> buy, target 0 -> sell
    buys = [[seq, target] for seq, target in sequential_data if target == 1]
    sells = [[seq, target] for seq, target in sequential_data if target == 0]

    # Balance the classes by truncating both to the size of the smaller one
    lower = min(len(buys), len(sells))
    sequential_data = buys[:lower] + sells[:lower]

    # Shuffle again so the classes are interleaved
    random.shuffle(sequential_data)

    X = [seq for seq, _ in sequential_data]
    y = [target for _, target in sequential_data]

    return np.array(X), y


In [None]:
# Build balanced, shuffled (sequence, target) sets from the training and validation frames
X_train, y_train = preprocess_df(main_df)
X_test, y_test = preprocess_df(validation_main_df)

In [None]:
# Sanity check: number of samples and class counts (1 = buy, 0 = sell) after preprocessing
print(f"train data: {len(X_train)}, test data: {len(X_test)}")
print(f"Training Targets: buys: {y_train.count(1)}, sells: {y_train.count(0)}")
print(f"Test Targets: buys: {y_test.count(1)}, sells: {y_test.count(0)}")

## Training

In [None]:
import keras
import time

EPOCHS = 10       # passed as `epochs` to model.fit
BATCH_SIZE = 64   # passed as `batch_size` to model.fit
# Run identifier built from the prediction settings plus the current Unix timestamp;
# used for the TensorBoard log directory
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

#### If using NVIDIA GPU

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

#### If using AMD GPU (With plaidml installed)

In [None]:
import os
# Select the PlaidML backend for Keras through the KERAS_BACKEND environment variable.
# NOTE(review): Keras presumably reads KERAS_BACKEND when it is first imported, and
# `import keras` already appears in an earlier cell - confirm this assignment takes
# effect (it may need to run before that import, e.g. after a kernel restart).
os.environ['KERAS_BACKEND']='plaidml.keras.backend'
# When using plaidml, the libraries are imported from keras instead of tensorflow
# NOTE(review): the model cell builds its optimizer with `tf.keras.optimizers`, so `tf`
# must still be defined on this path - verify.
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, BatchNormalization
from keras.callbacks import TensorBoard, ModelCheckpoint

#### Set up RNN model

In [None]:
import os

# Stacked LSTM classifier: three 128-unit LSTM layers, each followed by dropout and
# batch normalization, then a small dense head ending in a 2-way softmax.
model = Sequential()

# Only the first layer needs input_shape; it is taken from the training data
model.add(LSTM(128, input_shape=(X_train.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# The last recurrent layer returns only its final output for the dense layers
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(2, activation="softmax"))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# Sparse loss: targets are integer class ids (0 or 1), not one-hot vectors
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

tensorboard = TensorBoard(log_dir=f"logs/{NAME}")

# Checkpoints are written during training, so the directory must exist before fit()
os.makedirs('models', exist_ok=True)

# Epoch number and validation accuracy are filled into the filename by the callback.
# NOTE(review): newer tf.keras versions report this metric as 'val_accuracy' - confirm
# the key for the installed version and adjust both the filename and `monitor`.
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"

# The options are arguments of ModelCheckpoint, not of str.format(): only the path is formatted
checkpoint = ModelCheckpoint(
    "models/{}.model".format(filepath),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)


In [None]:
import numpy as np

# preprocess_df returns the targets as plain Python lists; pass them to Keras as
# numpy arrays, since fit() may reject list targets paired with ndarray features.
history = model.fit(
            X_train, np.asarray(y_train),
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            validation_data=(X_test, np.asarray(y_test)),
            callbacks=[tensorboard, checkpoint])


In [None]:
import os

# Make sure the output directory exists (no error if it is already there)
os.makedirs('models', exist_ok=True)

# f-string so the run-specific NAME is interpolated; without the prefix the model
# was saved to the literal path "models/{NAME}"
model.save(f"models/{NAME}")