In [76]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.optimizers import Adam
import numpy as np
from pyspark.sql import SparkSession
from utils.data_preparation import split_data_lstm
from pyspark.sql.types import *
import pandas as pd
import json
import os
from pyspark.ml.feature import VectorAssembler, StandardScaler
import pyspark.sql.functions as F

In [2]:
spark = SparkSession.builder.appName("lstm").getOrCreate()

24/05/14 19:30:55 WARN Utils: Your hostname, alber-victus resolves to a loopback address: 127.0.1.1; using 192.168.1.25 instead (on interface wlp4s0)
24/05/14 19:30:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/14 19:30:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [84]:
def split_data_lstm(stock,train_size=0.8,spark=None,emotion=False):


    schema = StructType([

    StructField('date',StringType(),True),
    StructField('afinn_sentiment',FloatType(),True),
    StructField('pnn_sentiment',FloatType(),True),
    StructField('price_percent_change',FloatType(),True),
    StructField('volume_percent_change',FloatType(),True),
    StructField('next_day_price_percent_change_shifted',FloatType(),True)
    
    ])
    
    'date', 'afinn_sentiment', 'pnn_sentiment', 'price_percent_change', 'volume_percent_change', 'next_day_price_percent_change_shifted'
    df = spark.read.schema(schema).csv("../data/csv/"+stock+"/")


    # scale volumne 
    assembler = VectorAssembler(inputCols=['volume_percent_change'], outputCol="features")

    # Transform the data
    data = assembler.transform(df)
    
    # Initialize the StandardScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaled_volume_percent_change", withMean=True, withStd=True)
    
    # Compute summary statistics by fitting the StandardScaler
    scaler_model = scaler.fit(data)
    
    # Scale features
    scaled_data = scaler_model.transform(data)
    
    firstelement = F.udf(lambda v:float(v[0]),FloatType())
    df = scaled_data.withColumn("volume_percent_change", firstelement("scaled_volume_percent_change"))
    
    df = df.select('date',
     'afinn_sentiment',
     'pnn_sentiment',
     'price_percent_change',
     'volume_percent_change',
     'next_day_price_percent_change_shifted')
    
    n = df.count()
    train_size = int(n*train_size)

    train_data = df.limit(train_size)
    test_data = df.subtract(train_data)

    if emotion:
        X_train = train_data.select('afinn_sentiment', 'pnn_sentiment', 'price_percent_change','volume_percent_change')
        y_train = train_data.select("price_percent_change")
    
    
        X_train = np.array(X_train.rdd.map(lambda x: [x.afinn_sentiment,x.pnn_sentiment,x.price_percent_change,x.volume_percent_change]).collect())
        y_train = np.array(y_train.rdd.map(lambda x: [x.price_percent_change]).collect())
    
    
        X_test = test_data.select('afinn_sentiment', 'pnn_sentiment', 'price_percent_change','volume_percent_change')
        y_test = test_data.select("price_percent_change")
    
        X_test = np.array(X_test.rdd.map(lambda x: [x.afinn_sentiment,x.pnn_sentiment,x.price_percent_change,x.volume_percent_change]).collect())
        y_test = np.array(y_test.rdd.map(lambda x: [x.price_percent_change]).collect())

    else:
        
        X_train = train_data.select('price_percent_change','volume_percent_change')
        y_train = train_data.select("price_percent_change")
    
    
        X_train = np.array(X_train.rdd.map(lambda x: [x.price_percent_change,x.volume_percent_change]).collect())
        y_train = np.array(y_train.rdd.map(lambda x: [x.price_percent_change]).collect())
    
    
        X_test = test_data.select('price_percent_change','volume_percent_change')
        y_test = test_data.select("price_percent_change")
    
        X_test = np.array(X_test.rdd.map(lambda x: [x.price_percent_change,x.volume_percent_change]).collect())
        y_test = np.array(y_test.rdd.map(lambda x: [x.price_percent_change]).collect())
    # sc = spark.sparkContext
    # simple_rdd = to_simple_rdd(sc, X_train,y_train)


    return X_train, y_train, X_test, y_test

In [85]:
def r_squared(y_true,y_pred):

    r = 1 - np.sum((y_true-y_pred)**2)/(np.sum((y_true-y_true.mean())**2))
    return r

def train_lstm_regression(stock,train_size=0.9,spark=None,emotion=False,input_shape=10,batch_size=32,epoch=100):
    

    X_train, y_train, X_test, y_test = split_data_lstm(stock,train_size=train_size,spark=spark,emotion=emotion)

    train_data_generator = TimeseriesGenerator(
        data=X_train,
        targets=y_train,
        length=input_shape,  # Adjust the length as needed
        batch_size=batch_size,
        start_index=0,     # Start from the beginning of the time series
        end_index=None,   # End at the last available index
        shuffle=True
    )
    test_data_generator = TimeseriesGenerator(
        data=X_test,
        targets=y_test,
        length=input_shape,  # Adjust the length as needed
        batch_size=batch_size,
        start_index=0,     # Start from the beginning of the time series
        end_index=None,   # End at the last available index
        shuffle=True
    )

        
    # build model
    num_features = 4 if emotion else 2
    
    model = Sequential()
    
    model = Sequential()
    model.add(LSTM(150, activation='tanh', return_sequences=True,input_shape=(input_shape, num_features)))
    model.add(LSTM(64, activation='relu'))
    model.add(Dense(64))
    model.add(Dense(1))
    
    
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # model.compile(optimizer=Adam(learning_rate=learning_rate),loss='mse')
    
    history = model.fit(train_data_generator,epochs=epoch,validation_data=test_data_generator,batch_size=batch_size,verbose=1)

    # X_test = X_test.reshape(-1,1,num_features)

    sample = X_test[:input_shape]

    

    
    r_2_score = []
    for sample,target in test_data_generator:

        prediction = model.predict(sample)
    
        r_2 = r_squared(target,prediction)
        r_2_score.append(r_2)

    r_2_score = sum(r_2_score)/len(r_2_score)

    results = history.history
    results['test_r_2_score'] = r_2_score

    print(r_2_score)

    return model,results,r_2_score


def train_lstm_classification(stock,train_size=0.9,spark=None,emotion=False,input_shape=10,batch_size=32,epoch=100):
    

    X_train, y_train, X_test, y_test = split_data_lstm(stock,train_size=train_size,spark=spark,emotion=emotion)

    y_train = np.where(y_train<0,0,1)
    y_test = np.where(y_test<0,0,1)
    
    train_size = int(train_size*X_train.shape[0])

    X_valid = X_train[train_size:]
    y_valid = y_train[train_size:]    

    X_train = X_train[:train_size]
    y_train = y_train[:train_size]



    
    train_data_generator = TimeseriesGenerator(
        data=X_train,
        targets=y_train,
        length=input_shape,  # Adjust the length as needed
        batch_size=batch_size,
        start_index=0,     # Start from the beginning of the time series
        end_index=None,   # End at the last available index
        shuffle=True
    )
    
    valid_data_generator = TimeseriesGenerator(
        data=X_valid,
        targets=y_valid,
        length=input_shape,  # Adjust the length as needed
        batch_size=batch_size,
        start_index=0,     # Start from the beginning of the time series
        end_index=None,   # End at the last available index
        shuffle=True
    )
    
    test_data_generator = TimeseriesGenerator(
        data=X_test,
        targets=y_test,
        length=input_shape,  # Adjust the length as needed
        batch_size=batch_size,
        start_index=0,     # Start from the beginning of the time series
        end_index=None,   # End at the last available index
        shuffle=True
    )

        
    # build model
    num_features = 4 if emotion else 2
    
    model = Sequential()
    
    model = Sequential()
    model.add(LSTM(150, activation='tanh', return_sequences=True,input_shape=(input_shape, num_features)))
    model.add(LSTM(64, activation='relu'))
    model.add(Dense(64))
    model.add(Dense(1))
    
    
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    
    # model.compile(optimizer=Adam(learning_rate=learning_rate),loss='mse')
    
    history = model.fit(train_data_generator,epochs=epoch,validation_data=valid_data_generator,batch_size=batch_size,verbose=1)

    accuracy = []

    for sample,target in test_data_generator:

        acc = model.evaluate(sample,target)[1]
        print(acc)
        accuracy.append(acc)

    accuracy = sum(accuracy)/len(accuracy)

    results = history.history
    results['test_accuracy'] = accuracy
    
    return model,results
    

In [86]:
accuracy = {}

for stock in ['Apple','NVIDIA']:


    for emotion in [False,True]:

        model_path = "../models/" + "_binary_"+ stock + "_emotion_" + str(emotion) + ".h5"
        results_path = "../results/" +"_binary_"+ stock + "_emotion_" + str(emotion) + ".json"

        key = stock + "_emotion_" + str(emotion)
        
        model,history = train_lstm_classification(stock,spark=spark,epoch=500,emotion=emotion,batch_size=32,input_shape=10)


        with open(results_path,'w') as f:

            json.dump(history,f)
        model.save(model_path)

        accuracy[key] = history['test_accuracy']


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

  saving_api.save_model(


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [89]:
accuracy

{'Apple_emotion_False': 0.4895833333333333,
 'Apple_emotion_True': 0.4635416666666667,
 'NVIDIA_emotion_False': 0.5833333333333334,
 'NVIDIA_emotion_True': 0.515625}

In [87]:
files = os.listdir("../results/")

metrics = {}

for file in files:

    with open("../results/" + file,'r') as f:

        met = json.load(f)
        name = file.split(".")[0]
        key = list(met.keys())[-1]
        print(key)
        metrics[name] = met[key]

test_r_2_score
test_r_2_score
test_accuracy
test_r_2_score
test_accuracy
test_accuracy
test_accuracy
test_r_2_score


In [88]:
metrics

{'Apple_emotion_False': -0.4487219454738757,
 'NVIDIA_emotion_True': -0.413444826453268,
 '_binary_NVIDIA_emotion_False': 0.5833333333333334,
 'NVIDIA_emotion_False': -0.30650175598528956,
 '_binary_Apple_emotion_True': 0.4635416666666667,
 '_binary_Apple_emotion_False': 0.4895833333333333,
 '_binary_NVIDIA_emotion_True': 0.515625,
 'Apple_emotion_True': -0.5288101857538933}

In [35]:
prediction

array([[ 0.43352675],
       [ 0.68121374],
       [ 0.47302502],
       [ 0.5793789 ],
       [ 0.6894058 ],
       [ 0.25517786],
       [ 0.4740654 ],
       [ 0.29401475],
       [-0.40045395],
       [ 0.4537572 ],
       [ 0.614033  ],
       [ 0.5793789 ],
       [ 0.614033  ],
       [ 0.58050704],
       [ 0.53640395],
       [ 0.400666  ],
       [ 0.599293  ],
       [ 0.48238555],
       [ 0.6340104 ],
       [ 0.599293  ],
       [ 0.5697628 ],
       [ 0.71701974],
       [ 0.49570626],
       [ 0.56371063],
       [ 0.6989556 ],
       [ 0.5157368 ],
       [ 0.5264658 ],
       [ 0.55504787],
       [ 0.5138999 ],
       [ 0.7419901 ],
       [ 0.6140329 ],
       [ 0.52023625]], dtype=float32)

In [27]:
history.history['accuracy']

[0.46875,
 0.5078125,
 0.5192307829856873,
 0.5192307829856873,
 0.5228365659713745,
 0.5378605723381042,
 0.534254789352417,
 0.5288461446762085,
 0.5408653616905212,
 0.5324519276618958,
 0.5649038553237915,
 0.5444711446762085,
 0.5661057829856873,
 0.5522836446762085,
 0.578125,
 0.5745192170143127,
 0.5763221383094788,
 0.578125,
 0.598557710647583,
 0.5847355723381042,
 0.5913461446762085,
 0.6171875,
 0.6243990659713745,
 0.6256009340286255,
 0.6286057829856873,
 0.5949519276618958,
 0.596754789352417,
 0.6159855723381042,
 0.6105769276618958,
 0.637620210647583,
 0.6568509340286255,
 0.6802884340286255,
 0.6310096383094788,
 0.6207932829856873,
 0.590745210647583,
 0.590745210647583,
 0.6472355723381042,
 0.6598557829856873,
 0.6604567170143127,
 0.6730769276618958,
 0.7181490659713745,
 0.5456730723381042,
 0.5492788553237915,
 0.5582932829856873,
 0.5745192170143127,
 0.5528846383094788,
 0.5763221383094788,
 0.5991586446762085,
 0.5877403616905212,
 0.6039663553237915,
 0.61

In [8]:
r_2_dict = {}

for stock in ['Apple','NVIDIA']:


    for emotion in [False,True]:

        model_path = "../models/" + stock + "_emotion_" + str(emotion) + ".h5"
        results_path = "../results/" + stock + "_emotion_" + str(emotion) + ".json"

        key = stock + "_emotion_" + str(emotion)
        
        model,history,r_2_score = train_lstm(stock,spark=spark,epoch=500,emotion=emotion,batch_size=32,input_shape=10)

        break

        with open(results_path,'w') as f:

            json.dump(history,f)
        model.save(model_path)

        r_2_dict[key] = r_2_score


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [191]:
# sample = X_test.reshape(-1,1,2)
# sample.shape

In [66]:
schema = StructType([

    StructField('date',StringType(),True),
    StructField('afinn_sentiment',FloatType(),True),
    StructField('pnn_sentiment',FloatType(),True),
    StructField('price_percent_change',FloatType(),True),
    StructField('volume_percent_change',FloatType(),True),
    StructField('next_day_price_percent_change_shifted',FloatType(),True)
    
    ])
df = spark.read.schema(schema).csv("../data/csv/Apple/")


In [67]:
df.volume_percent_change

Column<'volume_percent_change'>

In [83]:
df.show()

+----------+---------------+-------------+--------------------+---------------------+-------------------------------------+
|      date|afinn_sentiment|pnn_sentiment|price_percent_change|volume_percent_change|next_day_price_percent_change_shifted|
+----------+---------------+-------------+--------------------+---------------------+-------------------------------------+
|2017-01-04|      0.8635359|    0.8787879|         -0.11191723|            -0.920926|                           0.50856656|
|2017-01-05|      1.2822199|   0.94871795|          0.50856656|           0.01470386|                            1.1147993|
|2017-01-06|      0.9775525|   0.85714287|           1.1147993|            1.1348376|                            0.9159431|
|2017-01-09|      1.2078797|    0.8636364|           0.9159431|          0.032627705|                          0.100866936|
|2017-01-10|      1.3345554|          1.0|         0.100866936|          -0.93527114|                            0.5373229|
|2017-01