In [65]:
import tensorflow as tf 
import numpy as np 
import matplotlib as mp
mp.use('TkAgg')
import matplotlib.pyplot as plt 
import os as os
import pandas as pd
from ML_tools import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import DateType
from sklearn.preprocessing import StandardScaler

In [66]:
spark = SparkSession.builder \
        .appName('Company_Project') \
        .config("spark.jars", "mariadb-java-client-3.1.4.jar")\
        .getOrCreate()

DOCS = spark.read.format("jdbc")\
        .option("url","jdbc:mariadb://localhost:3306/lnd")\
        .option("driver", "org.mariadb.jdbc.Driver")\
        .option("dbtable", "DOCS")\
        .option("user", "ETL")\
        .option("password", os.environ.get('PASS'))\
        .load()

In [67]:
# convert to pandas
w = Window().partitionBy().orderBy(col('Date'))
DOCS_ML = DOCS.withColumn("PrevClose", lag("close", 1, 0).over(w)) \
        .withColumn("Return", (col("close") - col("PrevClose"))/ col("PrevClose")) \
        .withColumn("Date", DOCS.Date.cast(DateType()))
DOCS_pd = DOCS_ML.toPandas()

23/08/04 16:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/04 16:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/04 16:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/04 16:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/08/04 16:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [68]:
DOCS_pd.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name,PrevClose,Return
0,2021-06-24,41.169998,53.889999,41.169998,53.0,17305900,DOCS,0.0,
1,2021-06-25,50.0,55.98,48.099998,55.98,5004200,DOCS,53.0,0.056226
2,2021-06-28,56.990002,59.689999,54.16,58.349998,2776700,DOCS,55.98,0.042337
3,2021-06-29,59.799999,65.419998,54.150002,55.189999,4220500,DOCS,58.349998,-0.054156
4,2021-06-30,54.959999,59.66,54.959999,58.200001,3630700,DOCS,55.189999,0.054539


In [69]:
input_data = DOCS_pd[['Open', 'High', 'Low', 'Close', 'Volume']].values
targets =DOCS_pd['Return'].values

In [70]:
T= 10
D= input_data.shape[1]
N= input_data.shape[0]


In [71]:
Ntrain = len(input_data) * 2//3
scalar = StandardScaler()
scalar.fit(input_data[:Ntrain + T])
input_data = scalar.transform(input_data)

In [72]:
x_train = np.zeros((Ntrain, T, D))
y_train = np.zeros(Ntrain)
for t in range(Ntrain):
    x_train[t, :, :] = input_data[t:t+T]
    y_train[t] = (targets[t+T] > 0)

In [73]:
Ntest = N - Ntrain
x_test = np.zeros((Ntest, T, D))
y_test = np.zeros(Ntest)
for t in range(Ntest):
    x_test[t, :, :] = input_data[t:t+T]
    y_test[t] = (targets[t+T] > 0)

In [76]:

i = tf.keras.layers.Input(shape=(T,D))
x = tf.keras.layers.LSTM(50)(i)
o = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.models.Model(i, o)

model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(lr=0.01), metrics=['accuracy'])



In [77]:
r = model.fit(
    x_train, y_train,
    batch_size=32,
    epochs=300,
    validation_data=(x_test, y_test),
)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [80]:
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

In [81]:
plt.plot(r.history['accuracy'], label='accuracy')
plt.plot(r.history['val_accuracy'], label='val_accuracy')
plt.legend()
plt.show()