<a href="https://colab.research.google.com/github/aufawibowo/lstm-under-different-stability-notebook/blob/master/Notebook_Implementation_of_Stock_Prediction_Based_on_LSTM_under_Different_Stability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Capture the output lines of the `nvidia-smi` shell command (IPython `!` syntax).
gpu_info = !nvidia-smi
# Collapse the captured lines into one newline-separated string.
gpu_info = '\n'.join(gpu_info)
# The substring 'failed' in that output is treated as "no GPU available".
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.


In [2]:
# metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler

# high-language sequential modeling
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras import optimizers

# misc
from pandas import DataFrame
from pandas import Series
from pandas import concat
from pandas import read_csv
from pandas import datetime
from math import sqrt
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
import numpy as np
import matplotlib.pyplot as plt

# statistic stationarity
from statsmodels.tsa.stattools import adfuller

# progress bar
from tqdm import tqdm
from tqdm.notebook import tnrange

Using TensorFlow backend.
  import pandas.util.testing as tm


In [3]:
class SourceFile:
    """Catalogue of the historical-price CSV URLs and their display names.

    ``source_file`` and ``stock_name`` are parallel lists: the entry at a given
    position in one belongs with the entry at the same position in the other.
    """

    def __init__(self):
        # Raw-GitHub URLs of the CSV datasets.
        self.source_file = ["https://raw.githubusercontent.com/aufawibowo/lstm-under-different-stability/master/dataset/SZSE%20200%20Historical%20Data.csv",
                "https://raw.githubusercontent.com/aufawibowo/lstm-under-different-stability/master/dataset/SZSE%20300%20Historical%20Data.csv",
                "https://raw.githubusercontent.com/aufawibowo/lstm-under-different-stability/master/dataset/Shanghai%20Shenzhen%20CSI%20300%20Historical%20Data.csv",
                 "https://raw.githubusercontent.com/aufawibowo/lstm-under-different-stability/master/dataset/Jakarta%20Stock%20Exchange%20Composite%20Index%20Historical%20Data.csv"]

        # Short labels, in the same order as ``source_file``.
        self.stock_name = ["SZSE 200",
                  "SZSE 300",
                  "CSI 300",
                  "JKSE"]

    def get_one_file(self, i):
        """Return the CSV URL at position ``i``.

        A non-integer ``i`` does not raise: the message string below is
        returned in place of a URL, so callers that need a real URL must pass
        an ``int``.
        """
        if isinstance(i, int):
            return self.source_file[i]
        return "Parameter i to index must be an integer"

    def get_one_stock_name(self, i):
        """Return the display name at position ``i``."""
        return self.stock_name[i]

    def get_all_file(self):
        """Return the full list of CSV URLs."""
        # The original definition had no ``self`` parameter, so calling it on
        # an instance raised TypeError before the body could run.
        return self.source_file

    def get_all_stock_name(self, i=None):
        """Return the full list of display names.

        ``i`` is ignored; it is kept (now optional) only so that existing
        calls that pass an argument keep working.
        """
        return self.stock_name

In [4]:
# data log debugger
# Module-level dict that the functions defined in the following cells write
# intermediate arrays and per-run metrics into, so that later cells can
# inspect them.
data_log = {}

In [5]:
def timeseries_to_supervised(data, lag=1):
    """Frame a sequence as a supervised-learning table.

    The result holds one shifted copy of ``data`` per lag (lag 1 first, then
    lag 2, ... up to ``lag``) followed by the unshifted data as the last
    column. The row labelled 0 is removed before returning; rows that still
    contain NaN for lags greater than 1 are left in place.
    """
    frame = DataFrame(data)
    lagged = []
    for step in range(1, lag + 1):
        lagged.append(frame.shift(step))
    table = concat(lagged + [frame], axis=1)
    return table.drop(0)

In [6]:
def difference(dataset, interval=1):
    """Return a Series of ``dataset[i] - dataset[i - interval]`` for every
    ``i`` from ``interval`` up to the end of ``dataset``."""
    deltas = [
        dataset[pos] - dataset[pos - interval]
        for pos in range(interval, len(dataset))
    ]
    return Series(deltas)

In [7]:
def inverse_difference(history, yhat, interval=1):
    """Undo a differencing step: add the value found ``interval`` positions
    from the end of ``history`` back onto ``yhat``."""
    baseline = history[-interval]
    return yhat + baseline

In [8]:
def scale(train, test):
    """Fit a (0, 1) MinMaxScaler on ``train`` and apply it to both splits.

    Returns the fitted scaler together with the transformed train and test
    arrays. Both inputs are indexed as ``shape[0]`` / ``shape[1]``, so they
    must have at least two dimensions.
    """
    # The scaler is fitted on the training rows only.
    scaler = MinMaxScaler(feature_range=(0, 1)).fit(train)

    def _as_rows_by_cols(block):
        n_rows, n_cols = block.shape[0], block.shape[1]
        return block.reshape(n_rows, n_cols)

    train_scaled = scaler.transform(_as_rows_by_cols(train))
    test_scaled = scaler.transform(_as_rows_by_cols(test))
    return scaler, train_scaled, test_scaled

In [9]:
def invert_scale(scaler, X, yhat):
    """Map one scaled forecast back through ``scaler.inverse_transform``.

    The values of ``X`` followed by ``yhat`` are laid out as a single row,
    passed to ``scaler.inverse_transform``, and the last entry of the
    inverted row is returned.
    """
    row = np.array(list(X) + [yhat])
    row = row.reshape(1, len(row))
    return scaler.inverse_transform(row)[0, -1]

In [10]:
# evaluate the model on a dataset; returns (RMSE, MAE, R^2) of the un-scaled, un-differenced forecasts
def evaluate(model, raw_data, scaled_dataset, scaler, offset, batch_size):
    """Forecast every row of ``scaled_dataset`` and score the forecasts.

    Every column of ``scaled_dataset`` except the last is used as model input,
    reshaped to ``(rows, 1, 1)`` (which requires a single input column). Each
    forecast is passed through ``invert_scale`` and then added to
    ``raw_data[i]``; the results are compared against ``raw_data[1:]``.

    ``offset`` is accepted but not used.

    Returns ``(rmse, mae, r_squared)``.
    """
    features = scaled_dataset[:, 0:-1]
    forecast = model.predict(features.reshape(len(features), 1, 1), batch_size=batch_size)

    # Undo the scaling first, then the differencing, one forecast at a time.
    predictions = [
        invert_scale(scaler, features[row], forecast[row, 0]) + raw_data[row]
        for row in range(len(forecast))
    ]

    actual = raw_data[1:]
    rmse = sqrt(mean_squared_error(actual, predictions))
    mae = mean_absolute_error(actual, predictions)
    r_squared = r2_score(actual, predictions)
    return rmse, mae, r_squared

In [11]:
# Show which entries have been recorded in data_log so far.
data_log.keys()

dict_keys([])

In [12]:
def fit_lstm(train, test, raw, scaler, batch_size, epochs, neurons, timesteps, repeatation):
    """Train a stateful single-layer LSTM and log per-epoch metrics.

    The model is fitted one epoch at a time so that RMSE, MAE and R^2 can be
    measured on both splits (via ``evaluate``) after every epoch.

    Parameters
    ----------
    train, test : 2-D arrays in supervised format; every column except the
        last is a model input and the last column is the target.
    raw : sequence whose tail is sliced and handed to ``evaluate`` as the
        values the forecasts are added back onto.
    scaler : fitted scaler, forwarded to ``evaluate``.
    batch_size : batch size used for the model definition, fitting and
        prediction.
    epochs : number of single-epoch ``fit`` calls.
    neurons : number of LSTM units.
    timesteps : second dimension of the reshaped input. The reshape below
        keeps the row count and the column count, so it only succeeds when
        ``timesteps`` is 1.
    repeatation : index of the current repeat; used in ``data_log`` key names.

    Returns
    -------
    The fitted Keras model.

    Side effects
    ------------
    Writes ``fit_lstm_X_reshaped``, one ``history_<repeat><epoch>`` entry per
    epoch, and the six per-repeat metric lists into the global ``data_log``.
    """
    X, y = train[:, 0:-1], train[:, -1]
    X = X.reshape(X.shape[0], timesteps, X.shape[1])
    data_log['fit_lstm_X_reshaped'] = X

    # building model
    model = Sequential()
    model.add(LSTM(neurons, activation='tanh', batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True))
    model.add(Dense(1, activation='tanh'))
    # NOTE(review): the original built optimizers.SGD(lr=0.000001) here but
    # never passed it to compile(), so training has always used the 'sgd'
    # string (library-default settings). The unused object was removed and the
    # effective optimizer is unchanged; pass an SGD instance instead if the
    # custom learning rate was intended.
    model.compile(loss='mean_squared_error', optimizer='sgd')

    # These slices do not change between epochs, so they are taken once.
    raw_train = raw[-(len(train)+len(test)+1):-len(test)]
    raw_test = raw[-(len(test)+1):]

    train_rmse, test_rmse = list(), list()
    train_mae, test_mae = list(), list()
    train_r_squared, test_r_squared = list(), list()
    for i in tnrange(epochs, desc = 'Fit LSTM'):
        # fit model for a single epoch, keeping sample order
        temp = model.fit(X, y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
        data_log['history_' + str(repeatation) + str(i)] = temp

        # evaluate model on train data
        rmse, mae, r_squared = evaluate(model, raw_train, train, scaler, 0, batch_size)
        train_rmse.append(rmse)
        train_mae.append(mae)
        train_r_squared.append(r_squared)

        # evaluate model on test data
        rmse, mae, r_squared = evaluate(model, raw_test, test, scaler, 0, batch_size)
        test_rmse.append(rmse)
        test_mae.append(mae)
        test_r_squared.append(r_squared)

        # clear the LSTM state before the next epoch
        model.reset_states()

    # log the per-epoch metrics in the global dict
    data_log['train_rmse_' + str(repeatation)] = train_rmse
    data_log['test_rmse_' + str(repeatation)] = test_rmse
    data_log['train_mae_' + str(repeatation)] = train_mae
    data_log['test_mae_' + str(repeatation)] = test_mae
    data_log['train_r_squared_' + str(repeatation)] = train_r_squared
    # Previously this entry was assigned the *train* list by mistake.
    data_log['test_r_squared_' + str(repeatation)] = test_r_squared

    return model

In [13]:
def plot_1(series, predictions, file_name):
    """Plot the whole price series with the predicted tail overlaid, and save it.

    Parameters
    ----------
    series : object with a ``'Price'`` column and an ``index``.
    predictions : predicted prices, aligned with the last
        ``len(predictions)`` rows of ``series``.
    file_name : used as the figure title and inside the output file name
        (``'LSTM_' + file_name + '_Full.png'``).
    """
    # Derive the tail length from the predictions instead of assuming 300 rows;
    # for 300 predictions this selects exactly the same rows as before.
    horizon = len(predictions)
    tail_index = series.index[-horizon:]

    pyplot.figure(figsize=(12,7))
    # The original also passed the format string 'green' here, which
    # contradicted color='blue'; only the keyword is kept.
    pyplot.plot(series['Price'], color='blue', label='LSTM Training Data')
    pyplot.plot(tail_index, predictions, color='green', marker='o', linestyle='dashed', label='LSTM Predicted Price')
    pyplot.plot(tail_index, series['Price'][-horizon:], color='red', label='Actual Price')
    pyplot.title(file_name)
    pyplot.xlabel('Dates')
    pyplot.ylabel('Prices')
    pyplot.legend()
    pyplot.savefig('LSTM_' + file_name + '_Full.png')

In [14]:
def plot_2(series, predictions, file_name):
    """Plot predicted against actual prices for the forecast window, and save it.

    Parameters
    ----------
    series : object with a ``'Price'`` column and an ``index``.
    predictions : predicted prices, aligned with the last
        ``len(predictions)`` rows of ``series``.
    file_name : used as the figure title and inside the output file name
        (``'LSTM_' + file_name + '_Prediction Result.png'``).
    """
    # Derive the window from the predictions instead of assuming 300 rows.
    horizon = len(predictions)
    tail_index = series.index[-horizon:]

    pyplot.figure(figsize=(12,7))
    pyplot.plot(tail_index, predictions, color='green', marker='o', linestyle='dashed', label='LSTM Predicted Price')
    pyplot.plot(tail_index, series['Price'][-horizon:], color='red', label='Actual Price')
    # One tick every 60 samples of the plotted window. The original used the
    # fixed row position 1486 and price values as tick labels, which only
    # matched one dataset length and did not match the 'Dates' axis.
    pyplot.xticks(tail_index[::60])
    pyplot.title(file_name)
    pyplot.xlabel('Dates')
    pyplot.ylabel('Prices')
    pyplot.legend()
    # The original sliced file_name[36:], which is empty for the short stock
    # names the driver passes, so every stock wrote to the same file.
    pyplot.savefig('LSTM_' + file_name + '_Prediction Result.png')

In [15]:
def do_trial(repeats, series, epochs, batch_size, neurons, timesteps, file_name):
    """Run the difference -> supervise -> scale -> fit -> forecast pipeline.

    The data preparation is done once; the model is then trained and scored
    ``repeats`` times.

    Parameters
    ----------
    repeats : number of independent training runs.
    series : object with a ``'Price'`` column.
    epochs, batch_size, neurons, timesteps : forwarded to ``fit_lstm``.
    file_name : used in the name of the PNG saved at the end.

    Returns
    -------
    ``(error_scores, predictions)``: the test RMSE of every repeat, and the
    un-scaled, un-differenced forecasts of the last repeat.

    Side effects
    ------------
    Writes intermediate arrays and results into the global ``data_log``,
    prints the RMSE of every repeat and the model summary, and saves a PNG.
    """
    # The last `test_size` supervised rows form the test split.
    test_size = 300
    raw_values = series['Price'].values

    # get difference
    diff_values = difference(raw_values, 1)
    data_log['diff_values'] = diff_values

    # convert to supervised
    supervised = timeseries_to_supervised(diff_values, timesteps)
    data_log['supervised'] = supervised

    # pick values only, skipping the first `timesteps` rows
    supervised_values = supervised.values[timesteps:,:]
    data_log['supervised_values'] = supervised_values

    train, test = supervised_values[0:-test_size], supervised_values[-test_size:]

    # scale data
    scaler, train_scaled, test_scaled = scale(train, test)
    data_log['train_scaled'] = train_scaled
    data_log['test_scaled'] = test_scaled

    # The first two training rows are discarded.
    # NOTE(review): presumably this makes the row count a multiple of
    # batch_size for the stateful LSTM - confirm for other data sizes.
    train_trimmed = train_scaled[2:, :]
    data_log['train_trimmed'] = train_trimmed

    error_scores = list()
    for repeatation in tnrange(repeats, desc = 'Main Lane'):
        lstm_model = fit_lstm(train_trimmed, test_scaled, raw_values, scaler, batch_size, epochs, neurons, timesteps, repeatation)

        train_reshaped = train_trimmed[:, 0].reshape(len(train_trimmed), 1, 1)
        data_log['train_reshaped'] = train_reshaped

        # Run the training inputs through the stateful model first; the
        # returned forecasts are discarded.
        lstm_model.predict(train_reshaped, batch_size=batch_size)

        test_reshaped = test_scaled[:,0:-1]
        test_reshaped = test_reshaped.reshape(len(test_reshaped), 1, 1)
        data_log['test_reshaped'] = test_reshaped

        # forecast the test split
        output = lstm_model.predict(test_reshaped, batch_size=batch_size)
        data_log['output'] = output

        predictions = list()

        # convert prediction result to unscaled, undifferenced data
        for i in range(len(output)):
            yhat = output[i,0]
            X = test_scaled[i, 0:-1]
            yhat = invert_scale(scaler, X, yhat)
            yhat = inverse_difference(raw_values, yhat, len(test_scaled)+1-i)
            predictions.append(yhat)

        # rmse after prediction
        rmse = sqrt(mean_squared_error(raw_values[-test_size:], predictions))
        print('%d) Test RMSE: %.3f' % (repeatation+1, rmse))
        error_scores.append(rmse)

    # NOTE(review): nothing is plotted inside this function, so this saves
    # whatever pyplot's current figure happens to be.
    pyplot.savefig('Error convergence_' + file_name + '.png')
    data_log['error_scores'] = error_scores

    # Model.summary() prints and returns None, so the original stored None.
    # Capture the text through print_fn, print it as before, and log it.
    summary_lines = list()
    lstm_model.summary(print_fn=lambda line, **_: summary_lines.append(line))
    summary_text = '\n'.join(summary_lines)
    print(summary_text)
    data_log['lstm_model_summary'] = summary_text
    data_log['predictions'] = predictions

    return error_scores, predictions

In [16]:
def return_scaled_data(repeats, series, epochs, batch_size, neurons, timesteps):
    """Return the scaled train and test splits of ``series['Price']``.

    The prices are differenced, framed with ``timeseries_to_supervised``,
    stripped of their first ``timesteps`` rows, split so that the last 300
    rows form the test set, and scaled with ``scale``.

    Only ``series`` and ``timesteps`` are used; the other parameters are
    accepted but ignored.
    """
    prices = series['Price'].values
    framed = timeseries_to_supervised(difference(prices, 1), timesteps)
    rows = framed.values[timesteps:,:]
    _, train_scaled, test_scaled = scale(rows[0:-300], rows[-300:])
    return train_scaled, test_scaled

In [17]:
def stationarity_info(series, file_address):
    """Print an ADF-test report for ``series`` under the heading ``file_address``.

    The statistic (``results[0]``), p-value (``results[1]``) and the critical
    values (``results[4]``) returned by ``adfuller`` are printed. The series
    is reported as non-stationary when the statistic is greater than the
    largest critical value, and as stationary otherwise.
    """
    results = adfuller(series)
    statistic = results[0]
    critical_values = results[4]

    print(file_address)
    print('ADF Statistic: %f' % statistic)
    print('p-value: %f' % results[1])
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print('\t%s: %.3f' % (level, threshold))

    if statistic > max(list(critical_values.values())):
        verdict = "Non-stationary and most likely random walk."
    else:
        verdict = "Stationary"
    print(verdict)
    print("")

In [18]:
# # code for plotting the scaled data
# pyplot.figure(figsize=(12,7))
# pyplot.plot(train_scaled, 'green', color='blue', label=file_address[36:-4] + 'Train Scaled')
# pyplot.title(file_address[36:]+ 'Train Scaled')
# pyplot.xlabel('Dates')
# pyplot.ylabel('Prices')
# #plt.xticks(np.arange(0,len(series), 300), series.index[0:len(series):300])
# pyplot.legend()
# pyplot.savefig(file_address[36:] + '_train_scaled.png')

# pyplot.figure(figsize=(12,7))
# pyplot.plot(test_scaled, 'green', color='blue', label=file_address[36:-4] + 'Test Scaled')
# pyplot.title(file_address[36:]+ 'Test Scaled')
# pyplot.xlabel('Dates')
# pyplot.ylabel('Prices')
# #plt.xticks(np.arange(0,len(series), 300), series.index[0:len(series):300])
# pyplot.legend()
# pyplot.savefig(file_address[36:] + '_test_scaled.png')

In [19]:

# Pick the dataset to run: index into the parallel URL / name lists.
file_object = SourceFile()
stock_array_index = 0
file_name = file_object.get_one_file(stock_array_index)
stock_name = file_object.get_one_stock_name(stock_array_index)

# `squeeze` is no longer passed: the result is indexed by column name
# ('Price') below, so it has to stay a DataFrame, and the keyword was removed
# from read_csv in pandas 2.0.
series = read_csv(file_name, header=0, parse_dates=[0], index_col=0)
# NOTE(review): head(1000) keeps the first 1000 rows as stored in the CSV -
# confirm the file is in chronological order, since the pipeline treats row
# order as time order.
series = series.head(1000)
repeats = 5
results = DataFrame()
epochs = 1000
batches = 5
neurons = 3
timesteps = 1

# get stationarity information
stationarity_info(series['Price'].values, stock_name)

# main lane
results[0], predictions = do_trial(repeats, series, epochs, batches, neurons, timesteps, stock_name)
print(results.describe())
results.boxplot()
# No legend() call: the boxplot has no labelled artists, so the original call
# only produced a "No handles with labels found" warning.
pyplot.show()

plot_1(series, predictions, stock_name)
plot_2(series, predictions, stock_name)

SZSE 200
ADF Statistic: -2.034669
p-value: 0.271582
Critical Values:
	1%: -3.437
	5%: -2.864
	10%: -2.568
Non-stationary and most likely random walk.



HBox(children=(FloatProgress(value=0.0, description='Main Lane', max=5.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Fit LSTM', max=1000.0, style=ProgressStyle(description_wi…


1) Test RMSE: 57.995


HBox(children=(FloatProgress(value=0.0, description='Fit LSTM', max=1000.0, style=ProgressStyle(description_wi…


2) Test RMSE: 57.732


HBox(children=(FloatProgress(value=0.0, description='Fit LSTM', max=1000.0, style=ProgressStyle(description_wi…


3) Test RMSE: 57.760


HBox(children=(FloatProgress(value=0.0, description='Fit LSTM', max=1000.0, style=ProgressStyle(description_wi…


4) Test RMSE: 57.348


HBox(children=(FloatProgress(value=0.0, description='Fit LSTM', max=1000.0, style=ProgressStyle(description_wi…


5) Test RMSE: 57.679

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (5, 3)                    60        
_________________________________________________________________
dense_5 (Dense)              (5, 1)                    4         
Total params: 64
Trainable params: 64
Non-trainable params: 0
_________________________________________________________________
               0
count   5.000000
mean   57.702808
std     0.232270
min    57.348127
25%    57.679057
50%    57.731661
75%    57.760334
max    57.994859


No handles with labels found to put in legend.


### Debugging

### Eval

#### RMSE

In [20]:
# Training RMSE logged for repeat 0 after its last epoch.
data_log['train_rmse_0'][-1]

122.43855381497244

In [21]:
# Test RMSE of every repeat, as returned by do_trial in the driver cell.
results

Unnamed: 0,0
0,57.994859
1,57.731661
2,57.760334
3,57.348127
4,57.679057


In [22]:
import matplotlib.pyplot as plt

# Seed NumPy's global generator so the synthetic sample below is repeatable.
np.random.seed(19680801)

# Synthetic sample: a uniform spread, a block of identical central values,
# and a handful of high and low outliers.
spread = 100 * np.random.rand(50)
center = np.full(25, 50.0)
flier_high = 100 + 100 * np.random.rand(10)
flier_low = -100 * np.random.rand(10)
data = np.concatenate([spread, center, flier_high, flier_low])

# Draw a single box plot of the combined sample and write it to disk.
fig1, ax1 = plt.subplots()
ax1.boxplot(data)
ax1.set_title('Basic Plot')
fig1.savefig('matplotlib.png')

In [23]:
# Display the synthetic `spread` sample generated in the previous cell.
spread

array([70.03673039, 74.27508094, 70.92800107, 56.67455225, 97.77853328,
       70.6334846 , 24.79157587, 15.78833509, 69.76985214, 71.99566667,
       25.77444303, 34.15467831, 96.87611677, 69.45070978, 46.63832593,
       70.28126954, 51.1785874 , 92.874137  , 73.97692989, 62.24390337,
       65.15454689, 39.68076115, 54.32393949, 79.98995282, 72.15447266,
       29.53639811, 16.09458806, 20.61255148, 13.43253868, 48.06050174,
       34.25218134, 36.29692901, 97.29176387, 11.09436116, 38.82640891,
       78.30658753, 97.28972601, 48.32096053, 33.64211054, 56.74190362,
        4.79415097, 38.8937029 , 90.63036451, 16.10182093, 74.36211347,
       63.29741618, 32.41800177, 92.23765324, 23.72264387, 82.39455709])

In [24]:
# NOTE(review): `df_train_rmse_list` is not assigned in any cell visible in
# this notebook; the recorded output of this cell is a NameError.
df_train_rmse_list

NameError: ignored

In [None]:
# Collect the per-epoch RMSE history of every repeat from data_log.
train_rmse_list = [data_log['train_rmse_' + str(i)] for i in range(repeats)]
test_rmse_list = [data_log['test_rmse_' + str(i)] for i in range(repeats)]

# line plot: one figure per split, one curve per repeat, epochs on the x axis.
# The histories are transposed so that every column is one repeat. Each figure
# is drawn on and saved through its own handle; the original overwrote the
# figure variables with the list returned by pyplot.plot and drew both curves
# on the last-created figure.
train_rmse_pyplot, train_rmse_ax = pyplot.subplots(figsize=(12,7))
train_rmse_ax.plot(np.array(train_rmse_list).T, color='blue', label='Train RMSE')
train_rmse_ax.set_title('Train RMSE per epoch')
train_rmse_ax.set_xlabel('Epoch')
train_rmse_ax.set_ylabel('RMSE')
train_rmse_pyplot.savefig('RMSE_train_plot_overtime.png')

test_rmse_pyplot, test_rmse_ax = pyplot.subplots(figsize=(12,7))
test_rmse_ax.plot(np.array(test_rmse_list).T, color='orange', label='Test RMSE')
test_rmse_ax.set_title('Test RMSE per epoch')
test_rmse_ax.set_xlabel('Epoch')
test_rmse_ax.set_ylabel('RMSE')
test_rmse_pyplot.savefig('RMSE_test_plot_overtime.png')

# box plot: distribution of the final-epoch RMSE across repeats
train_rmse_boxplot_fig, train_rmse_boxplot_ax = pyplot.subplots(figsize=(8,8))
train_rmse_boxplot_ax.boxplot([x[-1] for x in train_rmse_list])
train_rmse_boxplot_ax.set_title('Final-epoch train RMSE')
train_rmse_boxplot_fig.savefig('RMSE_train_boxplot.png')

test_rmse_boxplot_fig, test_rmse_boxplot_ax = pyplot.subplots(figsize=(8,8))
test_rmse_boxplot_ax.boxplot([x[-1] for x in test_rmse_list])
test_rmse_boxplot_ax.set_title('Final-epoch test RMSE')
test_rmse_boxplot_fig.savefig('RMSE_test_boxplot.png')

In [None]:
# Number of per-repeat RMSE histories collected in the previous cell.
len(train_rmse_list)

#### MAE

In [None]:
# Collect the per-epoch MAE history of every repeat from data_log.
train_mae_list = [data_log['train_mae_' + str(i)] for i in range(repeats)]
test_mae_list = [data_log['test_mae_' + str(i)] for i in range(repeats)]

# line plot: one figure per split, one curve per repeat, epochs on the x axis.
# The original overwrote the MAE lists with Figure objects, called the
# non-existent Figure.plot on the RMSE lists, and saved through undefined
# names; the data lists and figure handles are now kept apart.
train_mae_pyplot, train_mae_ax = pyplot.subplots(figsize=(12,7))
train_mae_ax.plot(np.array(train_mae_list).T, color='blue', label='Train MAE')
train_mae_ax.set_title('Train MAE per epoch')
train_mae_ax.set_xlabel('Epoch')
train_mae_ax.set_ylabel('MAE')
train_mae_pyplot.savefig('MAE_train_plot_overtime.png')

test_mae_pyplot, test_mae_ax = pyplot.subplots(figsize=(12,7))
test_mae_ax.plot(np.array(test_mae_list).T, color='orange', label='Test MAE')
test_mae_ax.set_title('Test MAE per epoch')
test_mae_ax.set_xlabel('Epoch')
test_mae_ax.set_ylabel('MAE')
test_mae_pyplot.savefig('MAE_test_plot_overtime.png')

# box plot: distribution of the final-epoch MAE across repeats (same
# convention as the RMSE box plots)
train_mae_boxplot_fig, train_mae_boxplot_ax = pyplot.subplots(figsize=(8,8))
train_mae_boxplot_ax.boxplot([x[-1] for x in train_mae_list])
train_mae_boxplot_ax.set_title('Final-epoch train MAE')
train_mae_boxplot_fig.savefig('MAE_train_boxplot.png')

test_mae_boxplot_fig, test_mae_boxplot_ax = pyplot.subplots(figsize=(8,8))
test_mae_boxplot_ax.boxplot([x[-1] for x in test_mae_list])
test_mae_boxplot_ax.set_title('Final-epoch test MAE')
test_mae_boxplot_fig.savefig('MAE_test_boxplot.png')

#### R2

In [None]:
# Collect the per-epoch R^2 history of every repeat from data_log. The
# original loop appended the MAE histories to the MAE lists instead, leaving
# both R^2 lists empty.
train_r2_list = [data_log['train_r_squared_' + str(i)] for i in range(repeats)]
test_r2_list = [data_log['test_r_squared_' + str(i)] for i in range(repeats)]

# line plot: one figure per split, one curve per repeat, epochs on the x axis.
# Drawing goes through Axes objects; Figure has no plot() or boxplot() method.
train_r2_pyplot, train_r2_ax = pyplot.subplots(figsize=(12,7))
train_r2_ax.plot(np.array(train_r2_list).T, color='blue', label='Train R2')
train_r2_ax.set_title('Train R2 per epoch')
train_r2_ax.set_xlabel('Epoch')
train_r2_ax.set_ylabel('R2')
train_r2_pyplot.savefig('R2_train_plot_overtime.png')

test_r2_pyplot, test_r2_ax = pyplot.subplots(figsize=(12,7))
test_r2_ax.plot(np.array(test_r2_list).T, color='orange', label='Test R2')
test_r2_ax.set_title('Test R2 per epoch')
test_r2_ax.set_xlabel('Epoch')
test_r2_ax.set_ylabel('R2')
test_r2_pyplot.savefig('R2_test_plot_overtime.png')

# box plot: distribution of the final-epoch R^2 across repeats (same
# convention as the RMSE box plots)
train_r2_boxplot_fig, train_r2_boxplot_ax = pyplot.subplots(figsize=(8,8))
train_r2_boxplot_ax.boxplot([x[-1] for x in train_r2_list])
train_r2_boxplot_ax.set_title('Final-epoch train R2')
train_r2_boxplot_fig.savefig('R2_train_boxplot.png')

test_r2_boxplot_fig, test_r2_boxplot_ax = pyplot.subplots(figsize=(8,8))
test_r2_boxplot_ax.boxplot([x[-1] for x in test_r2_list])
test_r2_boxplot_ax.set_title('Final-epoch test R2')
test_r2_boxplot_fig.savefig('R2_test_boxplot.png')

In [None]:
# Show every entry recorded in data_log by the run above.
data_log.keys()

In [None]:
# Show which metrics were recorded for one logged fit call. fit_lstm builds
# these keys as 'history_' + repeat index + epoch index.
data_log['history_0994'].history.keys()
# data_log['history_0994'].history['val_loss']

In [None]:
# Flatten the per-epoch training loss of every repeat into one sequence,
# ordered by repeat and then by epoch.
history_loss_list = list()

for i in range(repeats):
    # Iterate over the configured number of epochs rather than a literal 1000,
    # so the lookup stays in step with the keys that were actually logged.
    for j in range(epochs):
        history_loss_list.extend(data_log['history_' + str(i) + str(j)].history['loss'])

pyplot.figure(figsize=(12,7))
pyplot.plot(history_loss_list, color='blue', label='Loss')
pyplot.title('Training loss over all repeats')
pyplot.xlabel('Fit call (repeat x epoch)')
pyplot.ylabel('Loss')
# Add the legend before saving so that it is part of the written image.
pyplot.legend()
pyplot.savefig('loss_plot_overtime.png')
pyplot.show()

In [None]:


# NOTE(review): `history` is not assigned in any cell visible in this
# notebook, so this cell presumably fails on a fresh kernel - confirm where it
# was meant to come from. The only fit call visible here passes no validation
# data, so a 'val_loss' entry is presumably missing as well.
pyplot.figure(figsize=(12,7))
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
# The figure is written to disk before legend() is called.
pyplot.savefig('History loss.png')
pyplot.legend()
pyplot.show()