In [None]:
pip install transformers



In [None]:
pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.2-cp310-cp310-win_amd64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading statsmodels-0.14.2-cp310-cp310-win_amd64.whl (9.8 MB)
   ---------------------------------------- 0.0/9.8 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.8 MB 5.2 MB/s eta 0:00:02
   - -------------------------------------- 0.5/9.8 MB 6.0 MB/s eta 0:00:02
   -- ------------------------------------- 0.7/9.8 MB 5.7 MB/s eta 0:00:02
   ---- ----------------------------------- 1.0/9.8 MB 5.7 MB/s eta 0:00:02
   ----- ---------------------------------- 1.3/9.8 MB 5.7 MB/s eta 0:00:02
   ------ --------------------------------- 1.6/9.8 MB 5.9 MB/s eta 0:00:02
   ------- -------------------------------- 1.9/9.8 MB 6.0 MB/s eta 0:00:02
   -------- ------------------------------- 2.1/9.8 MB 5.8 MB/s eta 0:00:02
   --------- ------------------------------ 2.4/9

### Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler

# Load the stock data
stock_symbol = 'AAPL'
stock_data = yf.download(stock_symbol, start='2000-01-01')
data = stock_data['Close'].values.reshape(-1, 1)

# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

# Prepare the dataset
def create_dataset(dataset, time_step=1):
    X, y = [], []
    for i in range(len(dataset) - time_step):
        a = dataset[i:(i + time_step), 0]
        X.append(a)
        y.append(dataset[i + time_step, 0])
    return np.array(X), np.array(y)

time_step = 60
X, y = create_dataset(scaled_data, time_step)
X = X.reshape(X.shape[0], X.shape[1], 1)

train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


[*********************100%%**********************]  1 of 1 completed

X_train shape: (4900, 60, 1), y_train shape: (4900,)
X_test shape: (1225, 60, 1), y_test shape: (1225,)





### Auto-Regressor(ARIMA) architecture

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Train ARIMA model
arima_order = (5, 1, 0)  # Example order, should be tuned
arima_model = ARIMA(data, order=arima_order)
arima_result = arima_model.fit()

# Predict the next 1 day and 1 week
arima_forecast_1d = arima_result.forecast(steps=1)
arima_forecast_1w = arima_result.forecast(steps=7)

print("ARIMA 1-Day Forecast:", arima_forecast_1d)
print("ARIMA 1-Week Forecast:", arima_forecast_1w)


ARIMA 1-Day Forecast: [222.62306766]
ARIMA 1-Week Forecast: [222.62306766 222.61364004 222.6163137  222.56147578 222.65567424
 222.65363578 222.6522883 ]


### CNN-LSTM architecture

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense

# Define the CNN-LSTM model
cnn_lstm_model = Sequential([
    Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(time_step, 1)),
    LSTM(50, activation='relu'),
    Dense(1)
])

cnn_lstm_model.compile(optimizer='adam', loss='mse')
cnn_lstm_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Predict the next 1 day and 1 week
cnn_lstm_forecast_1d = cnn_lstm_model.predict(X_test[-1].reshape(1, time_step, 1))
cnn_lstm_forecast_1w = [cnn_lstm_model.predict(X_test[-(i + 1)].reshape(1, time_step, 1)) for i in range(7)]

# Inverse transform the predictions to get actual stock prices
cnn_lstm_forecast_1d = scaler.inverse_transform(cnn_lstm_forecast_1d)[0][0]
cnn_lstm_forecast_1w = [scaler.inverse_transform(f)[0][0] for f in cnn_lstm_forecast_1w]

print("CNN-LSTM 1-Day Forecast:", cnn_lstm_forecast_1d)
print("CNN-LSTM 1-Week Forecast:", cnn_lstm_forecast_1w)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step - loss: 0.0012 - val_loss: 0.0049
Epoch 2/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 7.7534e-06 - val_loss: 0.0032
Epoch 3/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 7.2491e-06 - val_loss: 0.0019
Epoch 4/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 8.2295e-06 - val_loss: 8.6307e-04
Epoch 5/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 7.4003e-06 - val_loss: 5.1653e-04
Epoch 6/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - loss: 7.6365e-06 - val_loss: 4.7466e-04
Epoch 7/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 7.8660e-06 - val_loss: 4.3104e-04
Epoch 8/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 7.0770e-06 - val_loss:

### Transformer Model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, MultiHeadAttention, GlobalAveragePooling1D
from tensorflow.keras.models import Model, Sequential

# Define TransformerBlock layer
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Define Transformer model
def create_transformer_model(input_shape, num_heads, ff_dim, num_transformer_blocks, rate=0.1):
    inputs = Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = TransformerBlock(input_shape[1], num_heads, ff_dim, rate)(x)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(rate)(x)
    x = Dense(20, activation="relu")(x)
    x = Dropout(rate)(x)
    outputs = Dense(1)(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

# Create and compile the Transformer model
transformer_model = create_transformer_model((time_step, 1), num_heads=2, ff_dim=32, num_transformer_blocks=2, rate=0.1)
transformer_model.compile(optimizer='adam', loss='mse')
transformer_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Predict the next 1 day and 1 week
transformer_forecast_1d = transformer_model.predict(X_test[-1].reshape(1, time_step, 1))
transformer_forecast_1w = []

# Rolling prediction for the next 1 week
current_input = X_test[-1].reshape(1, time_step, 1)
for _ in range(7):
    pred = transformer_model.predict(current_input)
    transformer_forecast_1w.append(pred[0, 0])
    # Reshape pred to match the shape of current_input
    pred = pred.reshape(1, 1, 1)
    # Update the input for the next prediction
    current_input = np.append(current_input[:, 1:, :], pred, axis=1)

# Inverse transform the predictions to get actual stock prices
transformer_forecast_1d = scaler.inverse_transform(transformer_forecast_1d)[0][0]
transformer_forecast_1w = scaler.inverse_transform(np.array(transformer_forecast_1w).reshape(-1, 1)).flatten()

print("Transformer 1-Day Forecast:", transformer_forecast_1d)
print("Transformer 1-Week Forecast:", transformer_forecast_1w)

Epoch 1/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 67ms/step - loss: 0.0058 - val_loss: 0.3294
Epoch 2/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.0042 - val_loss: 0.3229
Epoch 3/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0041 - val_loss: 0.3250
Epoch 4/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0043 - val_loss: 0.3275
Epoch 5/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0042 - val_loss: 0.3288
Epoch 6/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0044 - val_loss: 0.3255
Epoch 7/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0042 - val_loss: 0.3292
Epoch 8/50
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0043 - val_loss: 0.3304
Epoch 9/50
[1m154/154[0m [32m━━━━━━

### Result and Analysis

In [None]:
# Print forecasts for each model
print("ARIMA 1-Day Forecast:", arima_forecast_1d)
print("ARIMA 1-Week Forecast:", arima_forecast_1w)

print("CNN-LSTM 1-Day Forecast:", cnn_lstm_forecast_1d)
print("CNN-LSTM 1-Week Forecast:", cnn_lstm_forecast_1w)

print("Transformer 1-Day Forecast:", transformer_forecast_1d)
print("Transformer 1-Week Forecast:", transformer_forecast_1w)


ARIMA 1-Day Forecast: [222.62306766]
ARIMA 1-Week Forecast: [222.62306766 222.61364004 222.6163137  222.56147578 222.65567424
 222.65363578 222.6522883 ]
CNN-LSTM 1-Day Forecast: 197.6275
CNN-LSTM 1-Week Forecast: [197.6275, 202.07445, 198.06718, 197.21194, 196.47119, 194.9531, 195.81776]
Transformer 1-Day Forecast: 14.684911
Transformer 1-Week Forecast: [14.684911 14.684911 14.684911 14.684911 14.684911 14.684911 14.684911]


## Fine-Tuning Hyperparameters to reduce the loss

### 1) Auto Regressor architecture finetuning

In [None]:
pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4


In [None]:
import pmdarima as pm
from sklearn.metrics import mean_squared_error
import numpy as np
import yfinance as yf

# Load the stock data
stock_symbol = 'AAPL'
stock_data = yf.download(stock_symbol, start='2000-01-01')
data = stock_data['Close'].values

# Split data into training and testing sets
train_size = int(len(data) * 0.8)
train_data, test_data = data[:train_size], data[train_size:]

# Fit the ARIMA model with automatic hyperparameter tuning
model = pm.auto_arima(
    train_data,
    start_p=0,
    start_q=0,
    max_p=6,
    max_q=6,
    seasonal=False,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True
)

# Print the model summary
print(model.summary())

# Forecast the next 1 day and 1 week
forecast_1d = model.predict(n_periods=1)
forecast_1w = model.predict(n_periods=7)

print("Auto ARIMA 1-Day Forecast:", forecast_1d)
print("Auto ARIMA 1-Week Forecast:", forecast_1w)

# Evaluate the model on the test set
predictions = model.predict(n_periods=len(test_data))
mse = mean_squared_error(test_data, predictions)
print(f"Test MSE: {mse}")


[*********************100%%**********************]  1 of 1 completed


Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=3234.983, Time=1.54 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=3236.884, Time=0.60 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=3236.876, Time=1.01 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=3237.549, Time=0.27 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=3238.029, Time=3.88 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0] intercept
Total fit time: 7.326 seconds
                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 4948
Model:               SARIMAX(0, 1, 0)   Log Likelihood               -1615.491
Date:                Sat, 03 Aug 2024   AIC                           3234.983
Time:                        09:25:06   BIC                           3247.996
Sample:                             0   HQIC                          3239.546
                               - 4948                                       

### 2) CNN - LSTM architecture finetuning

In [1]:
import numpy as np
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense
import random

# Load the stock data
stock_symbol = 'AAPL'
stock_data = yf.download(stock_symbol, start='2000-01-01')
data = stock_data['Close'].values.reshape(-1, 1)

# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

# Prepare the dataset
def create_dataset(dataset, time_step=1):
    X, y = [], []
    for i in range(len(dataset) - time_step):
        X.append(dataset[i:(i + time_step), 0])
        y.append(dataset[i + time_step, 0])
    return np.array(X), np.array(y)

time_step = 60
X, y = create_dataset(scaled_data, time_step)
X = X.reshape(X.shape[0], X.shape[1], 1)

train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Define the CNN-LSTM model
def create_cnn_lstm_model(filters=64, kernel_size=2, lstm_units=50, learning_rate=0.001):
    model = Sequential([
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', input_shape=(time_step, 1)),
        LSTM(lstm_units, activation='relu'),
        Dense(1)
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')
    return model

# Hyperparameter space
filter_space = [32, 64, 128]
kernel_size_space = [2, 3, 4]
lstm_units_space = [50, 100, 150]
learning_rate_space = [0.001, 0.01]
batch_size_space = [16, 32, 64]
epochs = 50

# Random search
n_iter = 10
best_mse = float("inf")
best_params = {}

for _ in range(n_iter):
    filters = random.choice(filter_space)
    kernel_size = random.choice(kernel_size_space)
    lstm_units = random.choice(lstm_units_space)
    learning_rate = random.choice(learning_rate_space)
    batch_size = random.choice(batch_size_space)

    print(f"Training model with filters={filters}, kernel_size={kernel_size}, lstm_units={lstm_units}, learning_rate={learning_rate}, batch_size={batch_size}")

    model = create_cnn_lstm_model(filters, kernel_size, lstm_units, learning_rate)
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=0)

    # Evaluate the model
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"Model MSE: {mse}")

    if mse < best_mse:
        best_mse = mse
        best_params = {
            'filters': filters,
            'kernel_size': kernel_size,
            'lstm_units': lstm_units,
            'learning_rate': learning_rate,
            'batch_size': batch_size
        }

print("Best CNN-LSTM params:", best_params)
print("Best CNN-LSTM MSE:", best_mse)


[*********************100%%**********************]  1 of 1 completed
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model with filters=128, kernel_size=3, lstm_units=100, learning_rate=0.001, batch_size=16
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step
Model MSE: 0.00045404891630825743
Training model with filters=64, kernel_size=3, lstm_units=100, learning_rate=0.001, batch_size=64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step
Model MSE: 0.0009784592928676252
Training model with filters=64, kernel_size=4, lstm_units=50, learning_rate=0.001, batch_size=64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step
Model MSE: 0.002165623752749002
Training model with filters=32, kernel_size=2, lstm_units=100, learning_rate=0.01, batch_size=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step
Model MSE: 53204.22666318027
Training model with filters=32, kernel_size=2, lstm_units=50, learning_rate=0.001, batch_size=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
Model MSE: 0.00014497386190118832
Training model with filters=64, kernel_size=3, lstm_units=100, learning_rate=0.01, batch_size=64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
Model MSE: 0.13731153311482888
Training model with filters=128, kernel_size=2, lstm_units=150, learning_rate=0.001, batch_size=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step
Model MSE: 0.0002202809265694602
Training model with filters=64, kernel_size=4, lstm_units=100, learning_rate=0.001, batch_size=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step
Model MSE: 0.0037175917258935247
Training model with filters=32, kernel_size=4, lstm_units=100, learning_rate=0.01, batch_size=16


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step
Model MSE: 0.00028741596078182586
Training model with filters=32, kernel_size=2, lstm_units=150, learning_rate=0.001, batch_size=64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step
Model MSE: 0.0010140587295140524
Best CNN-LSTM params: {'filters': 32, 'kernel_size': 2, 'lstm_units': 50, 'learning_rate': 0.001, 'batch_size': 32}
Best CNN-LSTM MSE: 0.00014497386190118832


### 3) Transformer architecture finetuning

In [None]:
import numpy as np
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, MultiHeadAttention, GlobalAveragePooling1D
import random

# Load the stock data
stock_symbol = 'AAPL'
stock_data = yf.download(stock_symbol, start='2000-01-01')
data = stock_data['Close'].values.reshape(-1, 1)

# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

# Prepare the dataset
def create_dataset(dataset, time_step=1):
    X, y = [], []
    for i in range(len(dataset) - time_step):
        X.append(dataset[i:(i + time_step), 0])
        y.append(dataset[i + time_step, 0])
    return np.array(X), np.array(y)

time_step = 60
X, y = create_dataset(scaled_data, time_step)
X = X.reshape(X.shape[0], X.shape[1], 1)

train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

def create_transformer_model(input_shape, num_heads, ff_dim, num_transformer_blocks, rate=0.1):
    inputs = Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = TransformerBlock(input_shape[1], num_heads, ff_dim, rate)(x)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(rate)(x)
    x = Dense(20, activation="relu")(x)
    x = Dropout(rate)(x)
    outputs = Dense(1)(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

# Hyperparameter space
num_heads_space = [2, 4, 6, 8]
ff_dim_space = [32, 64, 128]
num_transformer_blocks_space = [1, 2, 3]
dropout_rate_space = [0.1, 0.2, 0.3]
learning_rate_space = [0.001, 0.01]
batch_size_space = [16, 32, 64]
epochs = 50

# Random search
n_iter = 20
best_mse = float("inf")
best_params = {}

for _ in range(n_iter):
    num_heads = random.choice(num_heads_space)
    ff_dim = random.choice(ff_dim_space)
    num_transformer_blocks = random.choice(num_transformer_blocks_space)
    dropout_rate = random.choice(dropout_rate_space)
    learning_rate = random.choice(learning_rate_space)
    batch_size = random.choice(batch_size_space)

    print(f"Training model with num_heads={num_heads}, ff_dim={ff_dim}, num_transformer_blocks={num_transformer_blocks}, dropout_rate={dropout_rate}, learning_rate={learning_rate}, batch_size={batch_size}")

    model = create_transformer_model((time_step, 1), num_heads, ff_dim, num_transformer_blocks, dropout_rate)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=0)

    # Evaluate the model
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"Model MSE: {mse}")

    if mse < best_mse:
        best_mse = mse
        best_params = {
            'num_heads': num_heads,
            'ff_dim': ff_dim,
            'num_transformer_blocks': num_transformer_blocks,
            'dropout_rate': dropout_rate,
            'learning_rate': learning_rate,
            'batch_size': batch_size
        }

print("Best Transformer params:", best_params)
print("Best Transformer MSE:", best_mse)


[*********************100%%**********************]  1 of 1 completed


Training model with num_heads=6, ff_dim=32, num_transformer_blocks=3, dropout_rate=0.2, learning_rate=0.01, batch_size=16
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step
Model MSE: 0.32335426839520254
Training model with num_heads=8, ff_dim=32, num_transformer_blocks=1, dropout_rate=0.2, learning_rate=0.01, batch_size=16
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step
Model MSE: 0.3241814895689793
Training model with num_heads=4, ff_dim=64, num_transformer_blocks=3, dropout_rate=0.3, learning_rate=0.001, batch_size=16
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step
Model MSE: 0.32016331441607493
Training model with num_heads=6, ff_dim=32, num_transformer_blocks=2, dropout_rate=0.1, learning_rate=0.001, batch_size=64
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step
Model MSE: 0.3253481064813118
Training model with num_heads=4, ff_dim=32, num_transformer_blocks=1, dropout_rate=0.1, lear