<a href="https://colab.research.google.com/github/akankshanehete/RNNtesting/blob/main/RNNTestingTimeSeries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#importing relevant libraries
import pandas as pd
import numpy as np 
from google.colab import drive
from sklearn import preprocessing
from collections import deque
import random
import time 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [None]:
# Experiment configuration shared by the cells below.
# Window length: each sample is the last 60 rows (one row per minute in this
# dataset), used to predict the crypto price 3 minutes ahead.
SEQ_LEN = 60
# How many periods forward the prediction target looks.
FUTURE_PERIOD_PREDICT = 3
# Pair whose close price is used to build the `future`/`target` columns.
RATIO_TO_PREDICT = 'LTC-USD'
# Training hyper-parameters handed to model.fit.
EPOCHS = 10
BATCH_SIZE =64
# Run label used for the TensorBoard log directory; the timestamp suffix keeps
# every run's name distinct.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [None]:
def classify(current, future):
  """Label one row for binary classification.

  Both arguments are coerced with ``float`` before comparing.

  Returns:
      1 when the future price is strictly higher than the current price,
      0 otherwise (equal, lower, or a NaN comparison that evaluates False).
  """
  return 1 if float(future) > float(current) else 0

In [None]:
# Mount Google Drive so the CSV files under `path` become readable from Colab.
drive.mount('/content/gdrive')
# Folder holding one CSV per currency pair; reused by the loading loop below.
path='/content/gdrive/My Drive/Colab Notebooks/CryptoDataRNN/'
# Column names are supplied explicitly through `names=`.
# NOTE(review): `df` is rebound for every pair in the loading loop of the next
# cell, so this read looks like a leftover sanity check - confirm before removing.
df = pd.read_csv(path +'LTC-USD.csv',names=['time', 'low', 'high','open' ,'close', 'volume'])


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Build one wide frame: for every pair keep only its close price and volume,
# prefixed with the pair name, and line the pairs up on the `time` index.
main_df = pd.DataFrame()
ratios = ['BTC-USD', 'LTC-USD','ETH-USD','BCH-USD']
for ratio in ratios:
  dataset = f'{path}{ratio}.csv'
  df = (
      pd.read_csv(dataset, names=['time', 'low', 'high','open' ,'close', 'volume'])
      .rename(columns={'close': f"{ratio}_close", 'volume': f"{ratio}_volume"})
      .set_index('time')
  )
  df = df[[f"{ratio}_close", f"{ratio}_volume"]]
  # The first pair seeds the frame; later pairs are joined onto its index, so
  # timestamps missing from a later pair show up as NaN.
  main_df = df if len(main_df) == 0 else main_df.join(df)



In [None]:
# `future` is the predicted pair's close price FUTURE_PERIOD_PREDICT rows ahead;
# `target` is 1 when that future price is higher than the current one, else 0.
# NOTE(review): the final FUTURE_PERIOD_PREDICT rows have no future price (NaN)
# and are therefore labelled 0 by classify().
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(periods=-FUTURE_PERIOD_PREDICT)
main_df['target'] = [
    classify(current, future)
    for current, future in zip(main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future'])
]
main_df.head(10)

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,ETH-USD_close,ETH-USD_volume,BCH-USD_close,BCH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968660,6489.549805,0.5871,96.580002,9.6472,,,871.719971,5.675361,96.5,0
1528968720,6487.379883,7.706374,96.660004,314.387024,486.01001,26.019083,870.859985,26.856577,96.389999,0
1528968780,6479.410156,3.088252,96.57,77.129799,486.0,8.4494,870.099976,1.1243,96.519997,0
1528968840,6479.410156,1.4041,96.5,7.216067,485.75,26.994646,870.789978,1.749862,96.440002,0
1528968900,6479.97998,0.753,96.389999,524.539978,486.0,77.355759,870.0,1.6805,96.470001,1
1528968960,6480.0,1.4909,96.519997,16.991997,486.0,7.5033,869.98999,1.669014,96.400002,0
1528969020,6477.220215,2.73195,96.440002,95.524078,485.98999,85.877251,869.450012,0.8652,96.400002,0
1528969080,6480.0,2.17424,96.470001,175.205307,485.98999,160.915192,869.98999,23.534929,96.400002,0
1528969140,6479.990234,0.9031,96.400002,43.652802,485.98999,61.371887,870.0,2.3,96.400002,0
1528969200,6478.660156,3.258786,96.400002,8.16,485.98999,42.687656,870.320007,9.255514,96.400002,0


In [None]:
def preprocess_df(df, seq_len=None):
    """Turn a price/volume frame into balanced, shuffled sequences for the RNN.

    Steps:
      1. Drop the look-ahead ``future`` column.
      2. Replace every feature column with its percent change from the previous
         row and standardize it (zero mean, unit variance).
      3. Slide a window of ``seq_len`` consecutive remaining rows over the
         frame, pairing each full window with the target of its last row.
      4. Down-sample the larger class so both classes have the same count.
      5. Shuffle the result.

    Args:
        df: DataFrame with a ``future`` column, a ``target`` column holding
            0/1 labels, and numeric feature columns. The caller's frame is not
            modified; all work happens on a copy.
        seq_len: Window length in rows. Defaults to the notebook-level
            ``SEQ_LEN`` when omitted.

    Returns:
        ``(X, y)`` where ``X`` is a numpy array stacking the windows (each of
        shape ``(seq_len, n_features)``) and ``y`` is a list with the matching
        target values.

    Raises:
        ValueError: if ``seq_len`` is smaller than 1.
    """
    if seq_len is None:
        seq_len = SEQ_LEN
    if seq_len < 1:
        raise ValueError("seq_len must be at least 1")

    # Keyword form: passing the axis positionally (`df.drop("future", 1)`) was
    # deprecated in pandas 1.3 and removed in pandas 2.0.
    df = df.drop(columns="future")

    # Put the target last explicitly so the row[:-1] / row[-1] split below does
    # not depend on the column order of the incoming frame.
    feature_cols = [col for col in df.columns if col != "target"]
    df = df[feature_cols + ["target"]].copy()

    for col in feature_cols:
        # Percent change puts differently priced coins on a comparable scale.
        # A zero followed by a non-zero value yields +/-inf, which
        # preprocessing.scale rejects, so treat those rows as missing.
        df[col] = df[col].pct_change().replace([np.inf, -np.inf], np.nan)
        df.dropna(inplace=True)  # removes the NaN rows created by pct_change
        # Standardize to zero mean / unit variance (NOT a 0..1 range).
        df[col] = preprocessing.scale(df[col].values)

    df.dropna(inplace=True)

    sequential_data = []  # [window, target] pairs
    window = deque(maxlen=seq_len)  # always holds the most recent seq_len rows

    for row in df.values:
        window.append(list(row[:-1]))  # features only, target excluded
        if len(window) == seq_len:  # emit only once the window is full
            sequential_data.append([np.array(window), row[-1]])

    random.shuffle(sequential_data)

    # Split by class so the two classes can be balanced.
    buys = [pair for pair in sequential_data if pair[1] == 1]
    sells = [pair for pair in sequential_data if pair[1] == 0]

    random.shuffle(buys)
    random.shuffle(sells)

    # Truncate both classes to the size of the smaller one.
    lower = min(len(buys), len(sells))
    sequential_data = buys[:lower] + sells[:lower]
    # Shuffle again so the classes are interleaved rather than back to back.
    random.shuffle(sequential_data)

    X = [seq for seq, _ in sequential_data]
    y = [target for _, target in sequential_data]

    return np.array(X), y


In [None]:
# Chronological train/validation split followed by sequence building.
# The index is the `time` column, so sorting it and taking the value at the
# start of the last 5% gives a cut-off timestamp: rows at or after it become
# the validation set, earlier rows stay in the training set.
times = sorted(main_df.index.values)
last_5pct = times[-(int(0.05*len(times)))]
# Order matters: the validation slice must be taken before `main_df` is rebound
# to the training slice on the next line.
validation_main_df = main_df[(main_df.index >= last_5pct)]
# NOTE(review): `main_df` is overwritten here, so re-running this cell without
# re-running the earlier cells cuts another 5% off the training data.
main_df = main_df[(main_df.index < last_5pct)]
# Each split is normalized, windowed and class-balanced independently.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)


  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


[[[-1.09261202e+00  1.65900741e-01 -5.80202556e-01 ... -4.29748338e-02
   -1.19047948e-02 -5.66664619e-03]
  [ 4.52859205e-01 -7.59543493e-02  4.23424699e-01 ... -5.44099799e-02
    1.24153980e-02 -6.22854254e-03]
  [-4.53111985e-01 -7.53418506e-02 -7.84974085e-02 ...  6.24288051e-03
   -3.79768003e-03 -6.20413944e-03]
  ...
  [ 8.00972034e-01  1.77460326e-01  6.69827430e-01 ... -7.82355568e-03
    2.46878482e+00 -4.80084983e-03]
  [ 3.28727134e-01 -1.84636728e-03  1.49941295e+00 ... -1.24600580e-02
   -6.13683614e-01 -6.21620203e-03]
  [-2.33375898e+00 -2.43367921e-02 -3.97199866e+00 ... -3.30750756e-02
   -1.58615768e+00 -6.22206286e-03]]

 [[-4.39293795e-01 -4.90628025e-02  1.03012008e-01 ... -3.84200903e-02
   -5.00621138e-01 -6.27103588e-03]
  [-2.18130650e-04 -7.47949499e-02  5.08305669e-03 ... -5.12537675e-02
    4.30881216e-03 -6.29687776e-03]
  [ 4.40780565e-01 -1.70468800e-02  5.08305669e-03 ... -5.43917584e-02
    1.44247874e-02 -6.23397740e-03]
  ...
  [-2.18130650e-04 -3.0

In [None]:
# Build the network: three stacked LSTM layers, each followed by dropout and
# batch normalization, then a small dense head.
model = Sequential()
# The input shape is taken from the training data: everything after the batch axis.
model.add(LSTM(128, input_shape=(train_x.shape[1:]), activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# return_sequences=True keeps the per-step outputs for the next LSTM layer
model.add(LSTM(128, return_sequences=True, activation ='relu'))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# the last LSTM returns only its final output, which feeds the dense head
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation = 'relu'))
model.add(Dropout(0.2))

# two output nodes with softmax: one probability per class (0 = not buy, 1 = buy)
model.add(Dense(2, activation='softmax'))

# NOTE(review): the `decay` argument is rejected by newer Keras optimizer
# implementations - confirm against the TensorFlow version in use.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)
# sparse_categorical_crossentropy matches integer class labels (0/1) with a
# two-node softmax output.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# tensorboard callback, one log directory per run
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

# File name template filled in by Keras at save time with the epoch number and
# that epoch's validation accuracy. With metrics=['accuracy'] tf.keras logs the
# validation metric under the key `val_accuracy`.
filepath = "RNN_Final-{epoch:02d}-{val_accuracy:.3f}"
# The monitor/verbose/save_best_only/mode options must be arguments of
# ModelCheckpoint itself; previously they were passed to str.format() by a
# misplaced parenthesis and silently discarded.
checkpoint = ModelCheckpoint(
    "models/{}.model".format(filepath),
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,  # keep a checkpoint only when the monitored metric improves
    mode='max',
)

In [None]:
# Train the model.
# Keras expects array inputs; preprocess_df returns the labels as a plain list,
# so everything is converted to numpy arrays first.
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
validation_x = np.asarray(validation_x)
validation_y = np.asarray(validation_y)
# The TensorBoard callback is passed in so the run is actually logged; before,
# it was created but never handed to fit().
# NOTE(review): `checkpoint` is not passed here; add it once its filepath
# template is confirmed to use a metric key that Keras actually logs (a missing
# key raises when the checkpoint tries to save).
model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard],
)


Epoch 1/10