In [None]:
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Dropout, Dense, LSTM
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt

import tensorflow as tf


In [None]:
# Mount Google Drive at the relative path "my-drive"; the CSV paths used
# further down are built under this directory.
drive.mount("my-drive")

# Load Dataset

### Read

In [None]:
# Path of the ACB price history inside the mounted Drive folder.
file_name = "my-drive/MyDrive/ACB_2015.csv"

# Order the rows by 'date' and rebuild a 0..n-1 index so that positional slices
# of the 'close' column and label lookups such as df.loc[i, 'date'] refer to the
# same row. The per-stock helper functions in this notebook sort by 'date' too,
# so the top-level analysis now sees the data in the same order as they do.
# NOTE(review): assumes the 'date' strings sort chronologically as stored
# (e.g. ISO 'YYYY-MM-DD') -- confirm against the CSV.
df = pd.read_csv(file_name).sort_values('date').reset_index(drop=True)

print(df)

In [None]:
# Show the 'close' column, the only series the rest of the notebook models.
print(df['close'])

### Visualization


In [None]:
# Line chart of the closing price over the whole data set, drawn against the
# row position; every `tick_step`-th row is labelled with its date.
n_rows = df.shape[0]
tick_step = 1951
fig, ax = plt.subplots(figsize=(18, 9))
ax.plot(range(n_rows), df['close'])
ax.set_xticks(range(0, n_rows, tick_step))
ax.set_xticklabels(df['date'].loc[::tick_step])
ax.set_xlabel('Date', fontsize=18)
ax.set_ylabel('Close Price', fontsize=18)
plt.show()

# Split Train & Test

In [None]:
# Closing prices as a NumPy array, in the row order of `df`.
prices = df['close'].to_numpy()

# The first 90% of the rows (integer division) form the training set and every
# remaining row forms the test set.
df_length = df.shape[0]
train_length = df_length * 90 // 100
# Derive the test size from what is actually left over. The previous
# `df_length * 10 // 100` is one less than len(prices[train_length:]) whenever
# df_length is not a multiple of 10, so the printed size disagreed with
# test_data.
test_length = df_length - train_length

print("Train: {} \n\nTest: {}".format(train_length, test_length))

train_data = prices[:train_length]
test_data = prices[train_length:]

# Process

## One-Step Ahead Prediction via Averaging

### Standard Average

In [None]:
window_size = 20
N = train_data.size
std_avg_predictions = []
std_avg_x = []
mse_errors = []

# Predict each training price as the plain mean of the `window_size` prices
# immediately before it. pred_idx never reaches N inside this range, so every
# prediction has a matching row in `df`; the former "next day" branch for
# pred_idx >= N could never run and referenced an undefined name, so it is gone.
for pred_idx in range(window_size, N):
    # NOTE: `date` is deliberately left as a notebook-level name because other
    # cells in this notebook may still read it.
    date = df.loc[pred_idx, 'date']

    std_avg_predictions.append(np.mean(train_data[pred_idx - window_size:pred_idx]))
    mse_errors.append((std_avg_predictions[-1] - train_data[pred_idx]) ** 2)
    std_avg_x.append(date)

# Report the collected errors using the same 0.5 * mean-squared-error convention
# as the exponential-moving-average cell.
print('MSE error for standard averaging: %.5f'%(0.5*np.mean(mse_errors)))

In [None]:
# Overlay the windowed-average predictions on the full closing-price series.
fig, ax = plt.subplots(figsize=(18, 9))
all_rows = range(df.shape[0])
predicted_rows = range(window_size, N)  # predictions exist only after one full window
ax.plot(all_rows, df['close'], color='b', label='True')
ax.plot(predicted_rows, std_avg_predictions, color='orange', label='Prediction')
ax.set_xlabel('Date')
ax.set_ylabel('Standard Average Price')
ax.legend(fontsize=18)
plt.show()

### Exponential Moving Average

In [None]:
N = train_data.size

run_avg_predictions = []
run_avg_x = []

mse_errors = []

# The running mean starts at 0.0, so the first few predictions are pulled
# towards zero until the average has caught up with the price level.
running_mean = 0.0
run_avg_predictions.append(running_mean)

decay = 0.5

for pred_idx in range(1, N):
    # One-step-ahead forecast: blend the previous estimate (weight `decay`) with
    # the most recently observed price (weight 1 - decay).
    running_mean = running_mean * decay + (1.0 - decay) * train_data[pred_idx - 1]
    run_avg_predictions.append(running_mean)
    mse_errors.append((run_avg_predictions[-1] - train_data[pred_idx]) ** 2)
    # Store the date of the row being predicted. The loop previously appended a
    # `date` variable left over from another cell, which recorded one stale
    # value for every prediction and raised NameError if that cell had not run.
    run_avg_x.append(df.loc[pred_idx, 'date'])

print('MSE error for EMA averaging: %.5f'%(0.5*np.mean(mse_errors)))

In [None]:
# Overlay the exponential-moving-average predictions on the full closing-price series.
fig, ax = plt.subplots(figsize=(18, 9))
all_rows = range(df.shape[0])
predicted_rows = range(0, N)  # one prediction per training row
ax.plot(all_rows, df['close'], color='b', label='True')
ax.plot(predicted_rows, run_avg_predictions, color='orange', label='Prediction')
ax.set_xlabel('Date')
ax.set_ylabel('Exponential Moving Average Price')
ax.legend(fontsize=18)
plt.show()

### Find baseline

In [None]:
def find_base_line(file_name, stock_name, decay=0.5):
    """Plot and score a one-step-ahead EMA baseline on the test part of one stock.

    The CSV is sorted by its 'date' column, the first 90% of the rows (integer
    division) are skipped, and the closing prices of the remaining rows are
    forecast one step ahead with an exponential moving average. The true and
    predicted series are plotted and MSE, MAPE and MAE are printed.

    Args:
        file_name: Path of a CSV file that has 'date' and 'close' columns.
        stock_name: Label used for the x axis and for the printed banner.
        decay: Weight kept from the previous running mean at each step; the
            newest observed price gets weight ``1 - decay``.

    Returns:
        dict with the keys 'MSE', 'MAPE' and 'MAE' holding the printed scores.

    Raises:
        ValueError: If the file leaves no rows for the test part.
    """
    df = pd.read_csv(file_name).sort_values('date')
    prices = df['close'].to_numpy()

    train_length = prices.shape[0] * 90 // 100
    test_data = prices[train_length:]
    N = test_data.size
    if N == 0:
        raise ValueError("no test rows available in {}".format(file_name))

    # Seed the running mean with the first test price, taken by position. The
    # earlier label lookup df['close'][train_length] could select a different
    # row, because sort_values keeps the original index labels.
    # NOTE: the first prediction therefore equals the first true value.
    running_mean = test_data[0]
    run_avg_predictions = [running_mean]

    for pred_idx in range(1, N):
        # Blend the previous estimate with the latest observed test price.
        running_mean = running_mean * decay + (1.0 - decay) * test_data[pred_idx - 1]
        run_avg_predictions.append(running_mean)

    # Both curves share range(N); the former range(test_length + 1) did not match
    # the length of the test slice when the row count was a multiple of 10.
    plt.figure(figsize=(18, 9))
    plt.plot(range(N), test_data, color='b', label='True')
    plt.plot(range(N), run_avg_predictions, color='orange', label='Prediction')
    plt.xlabel(stock_name)
    plt.ylabel('Exponential Moving Average Price')
    plt.legend(fontsize=18)
    plt.show()

    scores = {
        'MSE': mean_squared_error(test_data, run_avg_predictions),
        'MAPE': mean_absolute_percentage_error(test_data, run_avg_predictions),
        'MAE': mean_absolute_error(test_data, run_avg_predictions),
    }

    print("\n")
    print("============== {} ==============".format(stock_name))
    print("MSE: {}".format(scores['MSE']))
    print("MAPE: {}".format(scores['MAPE']))
    print("MAE: {}".format(scores['MAE']))
    print("\n")

    return scores


In [None]:
# Tickers to score with the EMA baseline; each one maps to "<ticker>_2015.csv"
# inside the mounted Drive folder.
stock_list = ["ACB", "BID", "BVH", "CTG", "FPT", "AGR"]
file_name_list = ["my-drive/MyDrive/" + ticker + "_2015.csv" for ticker in stock_list]

# Walk the paths and tickers in lockstep.
for csv_path, ticker in zip(file_name_list, stock_list):
    find_base_line(csv_path, ticker)


## LSTM

In [None]:
# Turn both price vectors into single-column 2-D arrays of shape (n, 1), so each
# window cut from them carries an explicit one-feature axis.
train_data, test_data = (series.reshape(-1, 1) for series in (train_data, test_data))

In [None]:
time_steps = 3


def _sliding_windows(series, steps):
    """Cut `series` into (inputs, targets) pairs for one-step-ahead prediction.

    Each input is the `steps` consecutive rows that end just before a position,
    and the matching target is the first column of the row at that position.
    """
    ends = range(steps, series.shape[0])
    inputs = np.array([series[end - steps:end] for end in ends])
    targets = np.array([series[end, 0] for end in ends])
    return inputs, targets


# Training and test windows are built independently, so no test window reaches
# back into the training rows.
x_train, y_train = _sliding_windows(train_data, time_steps)
x_test, y_test = _sliding_windows(test_data, time_steps)

In [None]:
# Stacked LSTM regressor: four recurrent layers, each followed by dropout, and a
# single-unit dense output. Dropout(rate) randomly zeroes that fraction of the
# incoming activations during training; it does not remove layers.
model = Sequential([
    # Layer 1: 100 units. return_sequences=True passes the whole output sequence
    # on, so the next LSTM receives three-dimensional input. input_shape is
    # (window length, 1 feature), taken from the training windows.
    LSTM(units=100, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    Dropout(0.2),
    # Layer 2: 50 units, still returning sequences for the LSTM that follows.
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    # Layer 3: 50 units, still returning sequences; heavier dropout from here on.
    LSTM(units=50, return_sequences=True),
    Dropout(0.5),
    # Layer 4: 50 units, returning only its final output.
    LSTM(units=50),
    Dropout(0.5),
    # Output head: one value per window.
    Dense(units=1),
])
model.summary()

In [None]:
# Track mean squared error as a named metric next to the loss.
defined_metrics = [
    tf.keras.metrics.MeanSquaredError(name='MSE')
]

# Stop once the validation loss has failed to improve for 3 consecutive epochs,
# and roll back to the best weights seen instead of keeping the last epoch's.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, mode='min', verbose=1,
                                            restore_best_weights=True)

model.compile(optimizer='adam', loss='mean_squared_error', metrics=defined_metrics)
# Validate on the last 10% of the training windows rather than on the test set.
# Passing (x_test, y_test) as validation_data let early stopping pick the epoch
# that suits the test data, which makes the test scores reported afterwards
# optimistic. Keras takes the validation_split samples from the end of the
# arrays before shuffling, so the held-out part is the most recent stretch.
history = model.fit(x_train, y_train, epochs=100, batch_size=32, validation_split=0.1,
                    callbacks=[callback])

In [None]:
# Run the fitted network on the test windows. The model ends in Dense(units=1),
# so one value is predicted per window.
y_predict = model.predict(x_test)

In [None]:
# Score the test predictions with each regression metric, printed in a fixed order.
report = (
    ("MSE: {}", mean_squared_error),
    ("MAPE: {}", mean_absolute_percentage_error),
    ("MAE: {}", mean_absolute_error),
)
for template, score_fn in report:
    print(template.format(score_fn(y_test, y_predict)))

In [None]:
# Compare the LSTM predictions with the true test targets, window by window.
n_targets = y_test.shape[0]
fig, ax = plt.subplots(figsize=(18, 9))
print(n_targets)
ax.plot(range(n_targets), y_test, color='b', label='True')
ax.plot(range(y_predict.shape[0]), y_predict, color='orange', label='Prediction')
ax.set_xlabel('Date')
ax.set_ylabel('Predict LSTM')
ax.legend(fontsize=18)
plt.show()

In [None]:
def predict_LSTM(file_name, stock_name, time_steps=3):
    """Train a fresh stacked LSTM on one stock and score it on that stock's test rows.

    The CSV is sorted by 'date'; the first 90% of the rows (integer division)
    are used for training and the rest for testing. Both parts are cut into
    windows of `time_steps` consecutive closing prices, each paired with the
    next closing price as target. The model is local to this call, so the
    notebook-level `model` is left untouched.

    Args:
        file_name: Path of a CSV file that has 'date' and 'close' columns.
        stock_name: Label used in the printed banner.
        time_steps: Number of past closing prices in every input window.

    Returns:
        dict with the keys 'MSE', 'MAPE' and 'MAE' computed on the test windows.

    Raises:
        ValueError: If either part is too short to yield a single window.
    """
    df = pd.read_csv(file_name).sort_values('date')
    # Column array of shape (n, 1): one feature per time step.
    prices = df['close'].to_numpy().reshape(-1, 1)

    train_length = prices.shape[0] * 90 // 100
    train_data = prices[:train_length]
    test_data = prices[train_length:]

    def make_windows(series):
        # Inputs are the `time_steps` rows before each position; the target is
        # the value at that position.
        ends = range(time_steps, series.shape[0])
        inputs = np.array([series[end - time_steps:end] for end in ends])
        targets = np.array([series[end, 0] for end in ends])
        return inputs, targets

    x_train, y_train = make_windows(train_data)
    x_test, y_test = make_windows(test_data)
    if x_train.shape[0] == 0 or x_test.shape[0] == 0:
        raise ValueError(
            "{} has too few rows for windows of {} steps".format(file_name, time_steps))

    # Four LSTM layers, each followed by dropout, and a single-unit output.
    # return_sequences=True keeps the full sequence for the next LSTM layer.
    model = Sequential()
    model.add(LSTM(units=100, return_sequences=True, input_shape=(x_train.shape[1], 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(units=50))
    model.add(Dropout(0.5))
    model.add(Dense(units=1))
    model.summary()

    defined_metrics = [
        tf.keras.metrics.MeanSquaredError(name='MSE')
    ]

    # Stop after 3 epochs without validation improvement and keep the best
    # weights rather than the last epoch's.
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, mode='min',
                                                verbose=1, restore_best_weights=True)

    model.compile(optimizer='adam', loss='mean_squared_error', metrics=defined_metrics)
    # Validate on the last 10% of the training windows. Using the test windows
    # as validation_data let early stopping tune itself on the data that is
    # scored below, which inflates the reported test metrics.
    model.fit(x_train, y_train, epochs=100, batch_size=32, validation_split=0.1,
              callbacks=[callback])

    y_predict = model.predict(x_test)

    plt.figure(figsize=(18, 9))
    print(y_test.shape[0])
    plt.plot(range(y_test.shape[0]), y_test, color='b', label='True')
    plt.plot(range(y_predict.shape[0]), y_predict, color='orange', label='Prediction')
    plt.xlabel('Date')
    plt.ylabel('Predict LSTM')
    plt.legend(fontsize=18)
    plt.show()

    scores = {
        'MSE': mean_squared_error(y_test, y_predict),
        'MAPE': mean_absolute_percentage_error(y_test, y_predict),
        'MAE': mean_absolute_error(y_test, y_predict),
    }

    print("\n")
    print("============== {} ==============".format(stock_name))
    print("MSE: {}".format(scores['MSE']))
    print("MAPE: {}".format(scores['MAPE']))
    print("MAE: {}".format(scores['MAE']))
    print("\n")

    return scores


In [None]:
# Tickers that each get their own freshly trained LSTM; every ticker maps to
# "<ticker>_2015.csv" inside the mounted Drive folder.
stock_list = ["BID", "BVH", "CTG", "FPT", "AGR", "ACB"]
file_name_list = ["my-drive/MyDrive/" + ticker + "_2015.csv" for ticker in stock_list]

# Walk the paths and tickers in lockstep.
for csv_path, ticker in zip(file_name_list, stock_list):
    predict_LSTM(csv_path, ticker)


## Evaluating the ACB-trained LSTM on other stocks

In [None]:
def evaluate_another_stock(file_name, stock_name, trained_model=None):
    """Apply an already trained model to the full price history of another stock.

    The CSV is sorted by 'date', every closing price is used (no train/test
    split), and windows of 3 consecutive prices are fed to the model to predict
    the price that follows each window. The true and predicted series are
    plotted and MSE, MAPE and MAE are printed.

    Args:
        file_name: Path of a CSV file that has 'date' and 'close' columns.
        stock_name: Label used for the x axis and for the printed banner.
        trained_model: Object with a Keras-style ``predict`` method. When it is
            omitted, the notebook-level ``model`` is used, which matches how
            the function behaved before this parameter existed.

    Returns:
        dict with the keys 'MSE', 'MAPE' and 'MAE' for this stock.

    Raises:
        ValueError: If the file has too few rows to build a single window.
    """
    # Make the dependency on the trained network explicit while keeping
    # two-argument calls working through the notebook-level `model`.
    predictor = model if trained_model is None else trained_model

    df = pd.read_csv(file_name).sort_values('date')
    test_data = df['close'].to_numpy()

    print("Test: {}".format(test_data.shape[0]))

    # Column array of shape (n, 1): one feature per time step.
    test_data = test_data.reshape(-1, 1)

    # Window length; it has to match the one the model was trained with
    # (the training cell of this notebook uses 3).
    time_steps = 3

    ends = range(time_steps, test_data.shape[0])
    x_test = np.array([test_data[end - time_steps:end] for end in ends])
    y_test = np.array([test_data[end, 0] for end in ends])
    if x_test.shape[0] == 0:
        raise ValueError(
            "{} has too few rows for windows of {} steps".format(file_name, time_steps))

    y_predict = predictor.predict(x_test)

    plt.figure(figsize=(18, 9))
    plt.plot(range(y_test.shape[0]), y_test, color='b', label='True')
    plt.plot(range(y_predict.shape[0]), y_predict, color='orange', label='Prediction')
    plt.xlabel(stock_name)
    plt.ylabel('Predict LSTM')
    plt.legend(fontsize=18)
    plt.show()

    # Report the same three scores as the other evaluation helpers, so the
    # transfer result can be compared with the per-stock numbers.
    scores = {
        'MSE': mean_squared_error(y_test, y_predict),
        'MAPE': mean_absolute_percentage_error(y_test, y_predict),
        'MAE': mean_absolute_error(y_test, y_predict),
    }

    print("\n")
    print("============== {} ==============".format(stock_name))
    print("MSE: {}".format(scores['MSE']))
    print("MAPE: {}".format(scores['MAPE']))
    print("MAE: {}".format(scores['MAE']))
    print("\n")

    return scores


In [None]:
# Tickers scored with the already trained notebook-level model; every ticker
# maps to "<ticker>_2015.csv" inside the mounted Drive folder.
stock_list = ["BID", "BVH", "CTG", "FPT", "AGR"]
file_name_list = ["my-drive/MyDrive/" + ticker + "_2015.csv" for ticker in stock_list]

# Walk the paths and tickers in lockstep.
for csv_path, ticker in zip(file_name_list, stock_list):
    evaluate_another_stock(csv_path, ticker)
