In [None]:

from sklearn.preprocessing import MinMaxScaler, minmax_scale
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime, random, os, time
from collections import deque
from IPython.core.display import display
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization, GRU
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint

# Number of consecutive rows that make up one input sequence for the network.
SEQ_LEN = 48
# Flip to True to render the exploratory plots further down the notebook.
DO_PLOT = False

In [None]:
# Location of the EUR/USD CSV export. The EURUSD_CSV environment variable overrides
# the original author's local path (kept as the default) so that the notebook can be
# run on other machines without editing this cell.
dataset_location = os.environ.get(
    "EURUSD_CSV",
    "/Users/i354746/private/magisterka/datasets/eurusd-m15-2018/EURUSD.csv",
)

df = pd.read_csv(dataset_location)

# Rich display, consistent with the other cells of this notebook.
display(df.head())

In [None]:
# Normalise the column names of the raw export to lower case.
df.rename(columns={
    'Open': 'open', 'High': 'high', 'Low': 'low', 'Close': 'close',
    'Volume': 'volume', 'Date': 'date', 'Timestamp': 'timestamp',
}, inplace=True)

# we drop volume as it is not helpful TODO look at it later
# `columns=` is used because passing `axis` positionally to DataFrame.drop was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
df.drop(columns="volume", inplace=True)

# Merge the separate date and time-of-day columns into one datetime column.
df["timestamp"] = df["date"].astype(str) + " " + df["timestamp"]
df.drop(columns="date", inplace=True)
# The second rename that used to follow here was removed: every column it mapped
# has already been renamed above, and a 'Time' column cannot be the timestamp
# source because the concatenation above already requires 'timestamp' to exist.
# `infer_datetime_format` is omitted: it is deprecated since pandas 2.0 and was
# only a parsing-speed hint in earlier versions.
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Forward-fill missing values; `fillna(method="ffill")` is deprecated in recent pandas.
df.ffill(inplace=True)
# df.plot.line(x="timestamp", y="close")


In [None]:
# Convert the timestamps to float seconds since the Unix epoch and use them as index.
# Dividing the offset from the epoch by a one-second Timedelta does not depend on the
# platform's default integer width or on the datetime resolution, unlike the previous
# `.astype(int) / 1000000000`, and yields the same values for nanosecond timestamps.
timestamps = pd.to_datetime(df['timestamp'])
df["timestamp"] = (timestamps - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)
df.set_index('timestamp', inplace=True)
df = df.astype(float)

# Add additional features
# NOTE: 'momentum' needs the 'volume' column, which an earlier cell drops.
# df['momentum'] = df['volume'] * (df['open'] - df['close'])
# df['avg_price'] = (df['low'] + df['high']) / 2
# df['range'] = df['high'] - df['low']
# df['ohlc_price'] = (df['low'] + df['high'] + df['open'] + df['close']) / 4
# df['oc_diff'] = df['open'] - df['close']


display(df.head())

In [None]:
# Optional overview: every column of the frame in its own subplot.
if DO_PLOT:
    overview_opts = {"subplots": True, "layout": (3, 4), "figsize": (40, 20), "sharex": False}
    df.copy().plot(**overview_opts)

In [None]:
def plotPercentageChange(df):
    """Plot the row-over-row percentage change of every column except ``target``.

    The computation is done on a copy, so the frame passed in is left untouched.
    Each column is drawn in its own subplot on a 3x4 grid; ``target`` is plotted
    with its original values.
    """
    pct = df.copy()
    feature_cols = [name for name in pct.columns if name != "target"]
    for name in feature_cols:
        pct[name] = pct[name].pct_change()
    plot_opts = {"subplots": True, "layout": (3, 4), "figsize": (40, 20), "sharex": False}
    pct.plot(**plot_opts)

# Optional diagnostic: draw the per-column percentage changes of `df`
# only when plotting has been switched on via DO_PLOT.
if DO_PLOT:
    plotPercentageChange(df)

In [None]:
shift = 4  # 1 hour shift (prediction horizon, in rows)
# Close price `shift` rows ahead of each row.
df["future"] = df["close"].shift(-shift)
# The last `shift` rows have no future price yet; drop them (and any other NaN rows).
df.dropna(inplace=True)
# Forward return: positive when the close is HIGHER `shift` rows later.
# This used to be close - future, which inverted the sign, so the label derived
# from it (1 where return > 0, handled as a "buy" in preprocess_df) marked
# falling prices instead of rising ones.
df["return"] = df["future"] - df["close"]
display(df.tail(10))

In [None]:
# "future" was only needed to compute "return".
# `columns=` replaces the positional axis argument, which raises TypeError on pandas >= 2.0.
df.drop(columns="future", inplace=True)
# df["target"]= df["return"].shift(shift)
# Binary label: 1 where "return" is strictly positive, otherwise 0.
# Vectorised equivalent of the former row-wise lambda (NaN compares False -> 0).
df['target'] = (df['return'] > 0.0).astype('int64')

display(df.tail(15))

In [None]:
# Cut-off index value: rows at or after it form the most recent ~10% of the data.
times = sorted(df.index.values)
# At least one row: for fewer than 10 rows int(0.1 * len) is 0 and times[-0] would
# be the FIRST row, which would put the whole frame into the validation split.
n_validation = max(1, int(0.1 * len(times)))
last_10pct = times[-n_validation]
# The index holds seconds since the epoch derived from naive timestamps, so decode
# it without applying the machine's local time zone (datetime.fromtimestamp would
# shift the printed date by the local UTC offset).
print(last_10pct, pd.to_datetime(last_10pct, unit="s"))

In [None]:
# Chronological hold-out: rows at or after the cut-off become the validation frame,
# rows strictly before it remain in `df` for training.
in_validation = df.index >= last_10pct
before_cutoff = df.index < last_10pct
validation_df = df.loc[in_validation]
df = df.loc[before_cutoff]
validation_df.head()

In [None]:
def preprocess_df(df_in, seq_len=None):
    """Turn a price frame into balanced, shuffled (sequence, label) arrays.

    Steps:
      1. Drop rows containing NaN (on a copy; ``df_in`` is not modified).
      2. Min-max scale every feature column independently, using the minimum
         and maximum found in ``df_in`` itself.
      3. Slide a window of ``seq_len`` consecutive rows over the frame; every
         full window is labelled with the ``target`` value of its last row.
      4. Undersample the larger class so that both labels occur equally often,
         then shuffle.

    The ``target`` and ``return`` columns are never used as features. ``return``
    is computed from the future close price and directly determines ``target``;
    it used to be carried into the sequences (only its scaling was skipped),
    which leaked the label into the model input.

    Args:
        df_in: DataFrame with a ``target`` column holding 0/1 labels. All other
            columns except ``return`` are treated as numeric features.
        seq_len: Number of rows per sequence. Defaults to the module-level
            ``SEQ_LEN``.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(n, seq_len, n_features)`` and ``y``
        has shape ``(n,)`` with float labels 0.0 / 1.0. ``n`` is 0 when no
        balanced sample can be built.
    """
    if seq_len is None:
        seq_len = SEQ_LEN

    df = df_in.dropna()
    feature_cols = [col for col in df.columns if col not in ("target", "return")]
    empty_result = (np.empty((0, seq_len, len(feature_cols))), np.empty((0,)))
    if len(df) < seq_len:
        # Not even one full window (also avoids scaling an empty column).
        return empty_result

    # NOTE(review): each call scales with the statistics of its own frame, so the
    # training and validation frames are scaled independently of each other.
    features = df[feature_cols].astype(float)
    for col in feature_cols:
        features[col] = minmax_scale(features[col])
    feature_rows = features.to_numpy()
    targets = df["target"].to_numpy(dtype=float)

    # The deque keeps only the most recent `seq_len` rows, dropping older ones
    # automatically as new rows are appended.
    window = deque(maxlen=seq_len)
    buys = []   # windows whose label is 1
    sells = []  # windows whose label is 0 ("not buy")
    for row, target in zip(feature_rows, targets):
        window.append(row)
        if len(window) < seq_len:
            continue  # window not filled yet
        sample = (np.array(window), target)
        if target == 1:
            buys.append(sample)
        elif target == 0:
            sells.append(sample)

    # Shuffle each class before truncating so the undersampling is random.
    random.shuffle(buys)
    random.shuffle(sells)
    lower = min(len(buys), len(sells))
    if lower == 0:
        return empty_result

    balanced = buys[:lower] + sells[:lower]
    # Mix the classes so batches do not contain one class followed by the other.
    random.shuffle(balanced)

    X = np.array([seq for seq, _ in balanced])
    y = np.array([label for _, label in balanced])
    return X, y

In [None]:
# Build the balanced (sequence, label) arrays for the training and validation frames.
train_x, train_y = preprocess_df(df)
validation_x, validation_y = preprocess_df(validation_df)
# Optional diagnostics, left disabled. preprocess_df returns numpy arrays, which
# have no .count() method, so the class counts use np.count_nonzero instead.
# print(train_x[0])
# print(f"train data: {len(train_x)} validation: {len(validation_x)}")
# print(f"Dont buys: {np.count_nonzero(train_y == 0)}, buys: {np.count_nonzero(train_y == 1)}")
# print(f"VALIDATION Dont buys: {np.count_nonzero(validation_y == 0)}, buys: {np.count_nonzero(validation_y == 1)}")

In [None]:
print(train_x.shape)
# Last time step of the first training sequence. Index -1 replaces the hard-coded
# 47 (== SEQ_LEN - 1 for SEQ_LEN = 48), which raised IndexError for shorter sequences.
print(train_x[0][-1])
EPOCHS = 10  # how many passes through our data
BATCH_SIZE = 64  # samples per gradient update. Try smaller batch if you're getting OOM (out of memory) errors.
NAME = f"{SEQ_LEN}-SEQ-{shift}-PRED-{int(time.time())}"  # a unique name for the model

In [None]:
# Three stacked LSTM blocks (each followed by dropout and batch normalisation),
# a small dense layer, and a single sigmoid unit producing the binary prediction.
model = Sequential([
    # The first two LSTM layers return the full sequence so the next LSTM can consume it.
    LSTM(50, input_shape=(train_x.shape[1:]), return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),  # normalises activation outputs, same reason you want to normalise your input data

    LSTM(50, return_sequences=True),
    Dropout(0.1),
    BatchNormalization(),

    # The last LSTM layer returns only its final output.
    LSTM(10),
    Dropout(0.2),
    BatchNormalization(),

    Dense(10, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam with an inverse-time learning-rate decay: lr = 0.001 / (1 + 1e-6 * step).
# This is the schedule the legacy `decay=1e-6` argument applied. It is spelled out
# as an explicit schedule because newer Keras optimizers reject `decay=`, and
# `lr=` is a deprecated alias of `learning_rate=`.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=1,
    decay_rate=1e-6,
)
opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

In [None]:
# TensorBoard callback; the logs of this run go to logs/<NAME>.
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))
# Disabled checkpointing that would keep only the best model per validation accuracy.
# NOTE(review): with metrics=['accuracy'] the logged key may be 'val_accuracy' rather
# than 'val_acc', depending on the TensorFlow version - confirm before re-enabling.
# filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
# checkpoint = ModelCheckpoint("models/{}.model".format(filepath), monitor='val_acc', verbose=1, save_best_only=True, mode='max') # saves only the best ones

In [None]:
# Train model
# The validation arrays are evaluated after every epoch; TensorBoard logs the run.
fit_options = {
    "batch_size": BATCH_SIZE,
    "epochs": EPOCHS,
    "validation_data": (validation_x, validation_y),
    "callbacks": [tensorboard],
}
history = model.fit(train_x, train_y, **fit_options)

In [None]:
# Score model
# evaluate() returns the loss followed by the compiled metrics, in that order.
score = model.evaluate(validation_x, validation_y, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)
# Save model
save_path = "models/{}".format(NAME)
model.save(save_path)