In [2]:
# Imports
import pandas as pd
import numpy as np
import os
import keras
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import plotly.graph_objs as go
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Project goal: forecast the future closing price of Google stock from its
# historical prices, and compare two modelling approaches on that task:
# an LSTM (Long Short-Term Memory) network and a Bayesian linear regression.
#
# Only the "Close" column is modelled. The closing price is commonly treated as
# the most representative daily price and is a standard benchmark target in
# stock-prediction work. Open, High, Low and Volume could enrich the inputs,
# but they are deliberately left out to keep this baseline simple.
#
# The LSTM is meant to capture nonlinear, temporal dependencies in the series.
# The Bayesian linear regression serves as a probabilistic, interpretable
# baseline that allows uncertainty in the predictions to be quantified.
#
# NOTE(review): the preview below shows an "Unnamed: 0" column, presumably a
# row index that was written into the CSV; consider `index_col=0` -- confirm
# against how the file was produced.
raw_csv_path = '../data/GOOGL_historical_data.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2004-08-20,2.51575,2.716741,2.503048,2.697563,456686856
1,2004-08-23,2.758334,2.826327,2.715994,2.724711,365122512
2,2004-08-24,2.770538,2.779504,2.579509,2.611887,304946748
3,2004-08-25,2.614129,2.689843,2.587231,2.640031,183772044
4,2004-08-26,2.613879,2.688597,2.606657,2.687601,141897960


In [6]:
# Interactive Plotly line chart of the closing price over the whole date range
trace1 = go.Scatter(x=df['Date'], y=df['Close'], mode='lines', name='Data')

layout = go.Layout(
    title="Google Stock",
    xaxis=dict(title="Date"),
    yaxis=dict(title="Close"),
)

fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [7]:
# === DATA ===
# Closing prices as an (n, 1) column array: the 2D layout the scaler and the
# models below expect.
close_data = df['Close'].values.reshape(-1, 1)

# Rescale the prices into the [0, 1] range.
# NOTE(review): the scaler is fitted on the full series, so the min/max of the
# test period influence how the training data is scaled (look-ahead leakage).
# Fitting on the training slice only would avoid this, but any cached
# "lstm_model.h5" would then have to be retrained -- confirm before changing.
scaler = MinMaxScaler()
close_data_scaled = scaler.fit_transform(close_data)

# Chronological split: the first 80% of rows train the models, the rest test them
split_percent = 0.8
split_index = int(len(close_data_scaled) * split_percent)

close_train, close_test = close_data_scaled[:split_index], close_data_scaled[split_index:]

# Matching date slices, kept for the plots further down
date_train, date_test = df['Date'][:split_index], df['Date'][split_index:]

# Report the size of each partition (train first, then test)
print(len(close_train), len(close_test), sep="\n")

4100
1025


In [9]:
# Window length: every sample consists of the 10 most recent observations
look_back = 10

# === LSTM TIMESERIES GENERATOR ===
# Goal: turn the series into (input window, next value) pairs.
# Each input is a run of `look_back` consecutive time steps; the target is the
# value that follows it. The series is passed as both data and targets because
# the model predicts the series' own next value.

# Settings shared by the training and the testing generator
window_options = dict(length=look_back, batch_size=1)

train_generator = TimeseriesGenerator(close_train, close_train, **window_options)
test_generator = TimeseriesGenerator(close_test, close_test, **window_options)


In [11]:
# === LSTM MODEL (Load or Train) ===
# Goal: predict the next price from the last `look_back` observed values with
# an LSTM network. Training is skipped when a saved model is found on disk.
# NOTE(review): the cached file is reused even if `look_back`, the data or the
# scaling changed since it was written -- delete it to force a retrain.
model_path = "lstm_model.h5"

if not os.path.exists(model_path):
    # No cached model: build, train and persist a fresh one
    model = Sequential([
        LSTM(10, activation='relu', input_shape=(look_back, 1)),  # 10 LSTM units
        Dense(1),  # single output neuron for regression
    ])
    model.compile(optimizer='adam', loss='mse')  # mean squared error loss
    model.fit(train_generator, epochs=100, verbose=1)
    model.save(model_path)
    print("✅ LSTM model trained and saved.")
else:
    # Inference only, so the optimizer/loss state is not restored (compile=False)
    model = load_model(model_path, compile=False)
    print("✅ LSTM model loaded.")

✅ LSTM model loaded.


In [12]:
# === LSTM PREDICTION ===
# Ground truth for the test period, mapped back to the original price scale
close_test_actual = scaler.inverse_transform(close_test).ravel()

# The first `look_back` test rows only serve as input for the first window,
# so they have no prediction and are dropped from the truth values and dates.
close_test_actual_adj = close_test_actual[look_back:]
date_test_lstm_adj = date_test[look_back:]

# Run the network over the test windows and undo the scaling
prediction_scaled = model.predict(test_generator)
prediction = scaler.inverse_transform(prediction_scaled.reshape(-1, 1)).ravel()

# Mean squared error of the LSTM, in original price units squared
mse_lstm = mean_squared_error(close_test_actual_adj, prediction)
print(f"📉 Mean Squared Error (LSTM): {mse_lstm:.4f}")

[1m1015/1015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step
📉 Mean Squared Error (LSTM): 7.4021


In [13]:
# === BAYESIAN REGRESSION ===
# Goal: Use a probabilistic linear regression model to predict the next closing price.

# Helper function to convert data into input/output format manually
def create_dataset(data, look_back):
    """Turn a column of observations into sliding-window samples.

    Sample ``i`` pairs the ``look_back`` consecutive values starting at row
    ``i`` (the features) with the value that immediately follows them (the
    target). Only column 0 of ``data`` is used.

    Args:
        data: 2D array-like of shape (n_rows, n_columns). A 1D sequence is
            also accepted and treated as a single column.
        look_back: Number of past values per sample; must be non-negative.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays, where ``X`` has shape
        ``(n_samples, look_back)``, ``y`` has shape ``(n_samples,)`` and
        ``n_samples = max(n_rows - look_back, 0)``. When the input is too
        short to form a single sample, ``X`` still has ``look_back`` columns
        so downstream estimators see a consistent 2D layout.

    Raises:
        ValueError: If ``look_back`` is negative.
    """
    if look_back < 0:
        raise ValueError(f"look_back must be non-negative, got {look_back}")

    values = np.asarray(data)
    if values.ndim == 1:
        # Treat a flat sequence as one column of observations
        values = values.reshape(-1, 1)
    series = values[:, 0]

    n_samples = max(len(series) - look_back, 0)

    # Row i of the index grid is [i, i + 1, ..., i + look_back - 1]; indexing
    # with it builds every window at once and returns a fresh array.
    window_index = np.arange(n_samples)[:, None] + np.arange(look_back)[None, :]
    X = series[window_index]

    # Targets are the values directly after each window; copy so the result
    # does not alias the caller's array.
    y = series[look_back:look_back + n_samples].copy()
    return X, y

# Build (window, next value) samples for the train and test partitions
(X_train, y_train), (X_test, y_test) = (
    create_dataset(part, look_back) for part in (close_train, close_test)
)

# Fit a Bayesian ridge regression on the training windows
bayesian_model = BayesianRidge()
bayesian_model.fit(X_train, y_train)

# Predict the test targets, then map the predictions back to the price scale
bayesian_pred_scaled = bayesian_model.predict(X_test)
bayesian_pred = scaler.inverse_transform(bayesian_pred_scaled.reshape(-1, 1)).ravel()

# Same inverse transform for the true targets so both are in price units
y_test_actual = scaler.inverse_transform(y_test.reshape(-1, 1)).ravel()

# Dates of the predicted rows: the first `look_back` test dates have no prediction
date_test_bayes_adj = date_test[look_back:]

# Mean squared error of the Bayesian regression, in original price units squared
mse_bayes = mean_squared_error(y_test_actual, bayesian_pred)
print(f"📉 Mean Squared Error (Bayesian Regression): {mse_bayes:.4f}")

📉 Mean Squared Error (Bayesian Regression): 6.1698


In [14]:
# === PLOTTING FUNCTION ===
# Goal: Visually compare the predicted values vs. the actual ones for each model.

def plot_predictions(title, dates, train_data, pred, truth):
    """Show training data, predictions and ground truth in one interactive chart.

    Args:
        title: Title displayed above the figure.
        dates: Mapping with the x-values under the keys ``'train'`` (for
            ``train_data``) and ``'test'`` (for ``pred`` and ``truth``).
        train_data: y-values of the training series (drawn in blue).
        pred: Predicted y-values for the test period (drawn in red).
        truth: Actual y-values for the test period (drawn in green).

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # (x-values, y-values, legend label, line colour) for each curve, in draw order
    curve_specs = [
        (dates['train'], train_data, 'Train Data', 'blue'),
        (dates['test'], pred, 'Prediction', 'red'),
        (dates['test'], truth, 'Ground Truth', 'green'),
    ]
    curves = [
        go.Scatter(x=x_vals, y=y_vals, mode='lines', name=label, line=dict(color=colour))
        for x_vals, y_vals, label, colour in curve_specs
    ]

    layout = go.Layout(
        title=title,
        xaxis={'title': 'Date'},
        yaxis={'title': 'Close Price'},
    )

    fig = go.Figure(data=curves, layout=layout)
    fig.show()

# === PREPARE DATA FOR PLOTTING ===
# Training prices mapped back to the original scale, shared by both charts
train_plot_data = scaler.inverse_transform(close_train).reshape(-1)

# One chart per model: LSTM first, then Bayesian regression
for plot_title, test_dates, model_pred, model_truth in (
    ("📊 Google Stock – LSTM",
     date_test_lstm_adj, prediction, close_test_actual_adj),
    ("📊 Google Stock – Bayesian Regression",
     date_test_bayes_adj, bayesian_pred, y_test_actual),
):
    plot_predictions(plot_title,
                     {'train': date_train, 'test': test_dates},
                     train_plot_data, model_pred, model_truth)

In [15]:
# === RESIDUAL PLOTTING FUNCTION ===
# Goal: Plot the prediction error (residuals = actual - predicted) over time.
#       Additionally, show mean error (ME) and mean absolute error (MAE) as visual references.

def plot_residuals(title, dates, actual, predicted):
    """Print bias/MAE and plot the residuals (actual - predicted) over time.

    The chart carries three dotted horizontal reference lines: the mean error
    in red, and +MAE and -MAE in blue.

    Args:
        title: Figure title; also printed as the heading of the text summary.
        dates: x-values for the residuals. ``min(dates)`` and ``max(dates)``
            define how far the reference lines extend.
        actual: Observed values; must support elementwise subtraction with
            ``predicted``.
        predicted: Model predictions aligned with ``actual``.

    Returns:
        None. The summary is printed and the figure is displayed via
        ``fig.show()``.
    """
    residuals = actual - predicted
    mean_error = np.mean(residuals)
    mae = mean_absolute_error(actual, predicted)

    print(f"📊 {title}")
    print(f"   • Mean Error (bias): {mean_error:.4f}")
    print(f"   • Mean Absolute Error (avg. deviation): {mae:.4f}")

    trace = go.Scatter(
        x=dates,
        y=residuals,
        mode='lines+markers',
        name='Residuals',
        line=dict(color='orange'),
    )

    # Horizontal reference lines spanning the full date range:
    # mean error (red), then +MAE and -MAE (blue)
    x_first, x_last = min(dates), max(dates)
    reference_levels = [(mean_error, 'red'), (mae, 'blue'), (-mae, 'blue')]
    reference_lines = [
        dict(
            type='line',
            x0=x_first,
            y0=level,
            x1=x_last,
            y1=level,
            line=dict(color=colour, dash='dot'),
        )
        for level, colour in reference_levels
    ]

    layout = go.Layout(
        title=title,
        xaxis={'title': 'Date'},
        yaxis={'title': 'Prediction Error (Actual - Predicted)'},
        shapes=reference_lines,
    )

    fig = go.Figure(data=[trace], layout=layout)
    fig.show()

# === Residual plots for all models ===
# Same residual report for each model: LSTM first, then Bayesian regression
for residual_title, residual_dates, observed, forecast in (
    ("📉 Residuals – LSTM Model",
     date_test_lstm_adj, close_test_actual_adj, prediction),
    ("📉 Residuals – Bayesian Regression",
     date_test_bayes_adj, y_test_actual, bayesian_pred),
):
    plot_residuals(residual_title, residual_dates, observed, forecast)

📊 📉 Residuals – LSTM Model
   • Mean Error (bias): 1.0255
   • Mean Absolute Error (avg. deviation): 2.0484


📊 📉 Residuals – Bayesian Regression
   • Mean Error (bias): 0.0212
   • Mean Absolute Error (avg. deviation): 1.8122
