In [10]:
import os
import pandas as pd 
from collections import deque
import random 
import time
from sklearn import preprocessing
import numpy as np 

import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [11]:
CRYPTO_TO_PREDICT = "BTC-USD" # use any from data/
SEQ_LEN = 60  
FUTURE_PERIOD_PREDICT = 3  


In [12]:
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

## DATASET PREPROCESSING

In [13]:
def preprocess_df(df):
    df = df.drop("future", axis=1)  
    
    for col in df.columns:  
        if col != "target":  
            df[col] = df[col].pct_change()  
            df.dropna(inplace=True)  
            df[col] = preprocessing.scale(df[col].values)  
    
    df.dropna(inplace=True)
    
    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)

    for i in df.values:  
        prev_days.append([n for n in i[:-1]]) 
        if len(prev_days) == SEQ_LEN:  
            sequential_data.append([np.array(prev_days), i[-1]])
    
    random.shuffle(sequential_data)
    
    buys = []
    sells = []
    
    for seq, target in sequential_data:  
        if target == 0:  
            sells.append([seq, target]) 
        elif target == 1:  
            buys.append([seq, target])  
            
    random.shuffle(buys)
    random.shuffle(sells)
    
    lower = min(len(buys), len(sells))    
    
    buys=buys[:lower]
    sells = sells[:lower]
            
    sequential_data = buys+sells 
    random.shuffle(sequential_data)
    
    X = []
    y = []
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)
        
    return np.array(X), y

In [14]:
merged_df = pd.DataFrame()

cryptos = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"] 
for crypto in cryptos:  

    # crypto = cryptos.split('.csv')[0]  
    dataset = f'data/{crypto}.csv'  
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])  

    # rename volume and close 
    df.rename(columns={"close": f"{crypto}_close", "volume": f"{crypto}_volume"}, inplace=True)

    df.set_index("time", inplace=True)  # set time as index 
    df = df[[f"{crypto}_close", f"{crypto}_volume"]]  

    if len(merged_df)==0: 
        merged_df = df  
    else:  
        merged_df = merged_df.join(df)

merged_df.fillna(method="ffill", inplace=True)  

merged_df['future'] = merged_df[f'{CRYPTO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
merged_df['target'] = list(map(classify, merged_df[f'{CRYPTO_TO_PREDICT}_close'], merged_df['future']))

merged_df.dropna(inplace=True)

times = sorted(merged_df.index.values)
last_5pct = sorted(merged_df.index.values)[-int(0.05*len(times))]

validation_merged_df = merged_df[(merged_df.index >= last_5pct)]
merged_df = merged_df[(merged_df.index < last_5pct)]

X_train, y_train = preprocess_df(merged_df)
X_val, y_val = preprocess_df(validation_merged_df)

print(f"train data: {len(X_train)}  || validation: {len(X_val)}")
print(f"Dont buys: {y_train.count(0)}, buys: {y_train.count(1)}")
print(f"VALIDATION Dont buys: {y_val.count(0)}, buys: {y_val.count(1)}")

  merged_df.fillna(method="ffill", inplace=True)


train data: 83156  || validation: 4478
Dont buys: 41578, buys: 41578
VALIDATION Dont buys: 2239, buys: 2239


## RNN-MODEL

In [15]:
EPOCHS = 10  
BATCH_SIZE = 64  
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}" 

model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())  #normalizes activation outputs, same reason you want to normalize your input data.

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

tensorBoard = TensorBoard(log_dir="logs/{}".format(NAME))

filepath = "models/RNN_Final-{epoch:02d}-{val_accuracy:.3f}.keras"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor="val_accuracy",
    verbose=1,
    save_best_only=True,
    mode="max"
)



  super().__init__(**kwargs)


In [16]:
print(f"X_train shape: {X_train.shape}, y_train shape: {len(y_train)}")
print(f"X_val shape: {X_val.shape}, y_val shape: {len(y_val)}")


X_train shape: (83156, 60, 8), y_train shape: 83156
X_val shape: (4478, 60, 8), y_val shape: 4478


In [17]:
y_train = np.array(y_train)
y_val = np.array(y_val)


In [18]:
history = model.fit(
    X_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=[tensorBoard, checkpoint],
)

Epoch 1/10
[1m1299/1300[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 33ms/step - accuracy: 0.5213 - loss: 0.7502
Epoch 1: val_accuracy improved from -inf to 0.55895, saving model to models/RNN_Final-01-0.559.keras
[1m1300/1300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 34ms/step - accuracy: 0.5213 - loss: 0.7501 - val_accuracy: 0.5590 - val_loss: 0.6830
Epoch 2/10
[1m1300/1300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.5631 - loss: 0.6827
Epoch 2: val_accuracy improved from 0.55895 to 0.56655, saving model to models/RNN_Final-02-0.567.keras
[1m1300/1300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 33ms/step - accuracy: 0.5631 - loss: 0.6827 - val_accuracy: 0.5665 - val_loss: 0.6771
Epoch 3/10
[1m1299/1300[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 33ms/step - accuracy: 0.5686 - loss: 0.6806
Epoch 3: val_accuracy did not improve from 0.56655
[1m1300/1300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s