In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, LSTM, Dropout, Layer, Dense, BatchNormalization, Bidirectional, GRU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

2025-11-20 17:36:38.225001: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-20 17:36:38.272230: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-20 17:36:39.866489: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [2]:
#Read data from "preprocessing.csv"
data = pd.read_csv("Preprocessed.csv")

data = data[['Open', 'High', 'Low', 'Close', 'Volume']]

In [3]:
# Separate scalers for features and target
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Normalize the feature columns
data[['Open', 'High', 'Low', 'Volume']] = feature_scaler.fit_transform(
    data[['Open', 'High', 'Low', 'Volume']]
)

# Normalize the target column
data['Close'] = target_scaler.fit_transform(data[['Close']])

In [4]:
from joblib import Parallel, delayed

def create_sequences(df, seq_length, forecast_length, n_jobs=-1):
    def process(i):
        seq = df.iloc[i:i + seq_length].copy()
        target = df['Close'].iloc[i + seq_length:i + seq_length + forecast_length].values
        return seq, target

    results = Parallel(n_jobs=n_jobs, backend='loky')(
        delayed(process)(i)
        for i in range(len(df) - seq_length - forecast_length)
    )
    return results

# Usage
SEQ_LENGTH = 60
FORECAST_LENGTH = 5
sequences = create_sequences(data, SEQ_LENGTH, FORECAST_LENGTH)

# Split into X and y
X, y = zip(*sequences)

X = np.array([
    seq[['Open', 'High', 'Low', 'Close', 'Volume']].values
    for seq in X
])

y = np.array(y)

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, shuffle=False)

In [6]:
del data, sequences, X, y

In [7]:
# Define the Attention layer
@tf.keras.utils.register_keras_serializable()
class Attention(Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1), initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='attention_bias', shape=(input_shape[1], 1), initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        e = K.squeeze(e, axis=-1)
        alpha = K.softmax(e)
        alpha = K.expand_dims(alpha, axis=-1)
        context = x * alpha
        context = K.sum(context, axis=1)
        return context

In [8]:
@tf.keras.utils.register_keras_serializable()
class MultiHeadSelfAttention(Layer):
    def __init__(self, num_heads=4, head_dim=32, dropout_rate=0.1, use_causal_mask=True, **kwargs):
        """
        Args:
            num_heads: Number of attention heads.
            head_dim: Dimension of each head (proj_dim = num_heads * head_dim).
            dropout_rate: Dropout probability.
            use_causal_mask: If True, applies a look-ahead mask (critical for forecasting).
        """
        super(MultiHeadSelfAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.proj_dim = num_heads * head_dim
        self.dropout_rate = dropout_rate
        self.use_causal_mask = use_causal_mask

    def build(self, input_shape):
        feature_dim = input_shape[-1]

        # Linear Projections for Query, Key, Value
        self.Wq = self.add_weight(name="Wq", shape=(feature_dim, self.proj_dim), 
                                  initializer="glorot_uniform")
        self.Wk = self.add_weight(name="Wk", shape=(feature_dim, self.proj_dim), 
                                  initializer="glorot_uniform")
        self.Wv = self.add_weight(name="Wv", shape=(feature_dim, self.proj_dim), 
                                  initializer="glorot_uniform")

        # Output Projection
        self.dense = Dense(feature_dim)
        
        # Dropout Layers
        self.att_dropout = Dropout(self.dropout_rate)
        self.output_dropout = Dropout(self.dropout_rate)

        super(MultiHeadSelfAttention, self).build(input_shape)

    def split_heads(self, x, batch_size):
        # Reshape to (Batch, Seq_Len, Num_Heads, Head_Dim)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_dim))
        # Transpose to (Batch, Num_Heads, Seq_Len, Head_Dim)
        return tf.transpose(x, perm=(0, 2, 1, 3))

    def call(self, x, training=False):
        batch_size = tf.shape(x)[0]
        seq_len = tf.shape(x)[1]

        # 1. Project and Split Heads
        Q = self.split_heads(tf.matmul(x, self.Wq), batch_size)
        K = self.split_heads(tf.matmul(x, self.Wk), batch_size)
        V = self.split_heads(tf.matmul(x, self.Wv), batch_size)

        # 2. Scaled Dot-Product Attention
        # Shape: (Batch, Heads, Seq_Len, Seq_Len)
        score = tf.matmul(Q, K, transpose_b=True)
        
        # Scale scores to stabilize gradients
        scale = tf.math.sqrt(tf.cast(self.head_dim, tf.float32))
        score = score / scale

        # 3. Apply Causal Mask (Look-ahead Mask)
        if self.use_causal_mask:
            # Create a lower triangular matrix of ones (1s in past/present, 0s in future)
            ones = tf.ones((seq_len, seq_len))
            mask = tf.linalg.band_part(ones, -1, 0) # Keep lower triangle
            
            # Invert: 0s in past, 1s in future
            mask = 1.0 - mask
            
            # Add huge negative number to future positions so Softmax makes them 0
            # Shape broadcasting: (1, 1, Seq, Seq)
            mask = mask[tf.newaxis, tf.newaxis, :, :]
            score += (mask * -1e9)

        # 4. Softmax & Dropout
        weights = tf.nn.softmax(score, axis=-1)
        if training:
            weights = self.att_dropout(weights, training=training)

        # 5. Weighted Sum of Values
        attention_output = tf.matmul(weights, V)
        
        # 6. Concatenate Heads
        # Transpose back to (Batch, Seq_Len, Num_Heads, Head_Dim)
        attention_output = tf.transpose(attention_output, perm=(0, 2, 1, 3))
        # Flatten to (Batch, Seq_Len, Proj_Dim)
        concat = tf.reshape(attention_output, (batch_size, -1, self.proj_dim))

        # 7. Final Projection
        output = self.dense(concat)
        if training:
            output = self.output_dropout(output, training=training)

        return output[:, -1, :]

    def get_config(self):
        config = super().get_config()
        config.update({
            "num_heads": self.num_heads,
            "head_dim": self.head_dim,
            "dropout_rate": self.dropout_rate,
            "use_causal_mask": self.use_causal_mask
        })
        return config

In [9]:
def cnn_block(x):
    for _ in range(3):
        x = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')(x)
        x = Dropout(0.3)(x)
    return x

def tcn_block(x, filters=64, kernel_size=3, dilations=[1, 2, 4, 8]):
    for dilation in dilations:
        res = x
        x = Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            padding="causal",
            dilation_rate=dilation,
            activation="relu"
        )(x)
        x = Dropout(0.25)(x)

        x = Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            padding="causal",
            dilation_rate=dilation,
            activation="relu"
        )(x)

        # Residual connection
        if res.shape[-1] != x.shape[-1]:
            res = Conv1D(filters, kernel_size=1)(res)

        x = x + res
        x = Dropout(0.25)(x)
    return x

In [11]:
def build_lstm_model(input_shape):
    inputs = Input(shape=input_shape)
    x = cnn_block(inputs)

    x = LSTM(128, return_sequences=True)(x)
    x = Dropout(0.3)(x)
    x = LSTM(64, return_sequences=True)(x)
    x = Dropout(0.3)(x)
    x = LSTM(32, return_sequences=True)(x)
    x = Dropout(0.3)(x)

    attention = Attention()(x)

    outputs = Dense(FORECAST_LENGTH)(attention)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()])
    return model

model = build_lstm_model((SEQ_LENGTH, 5))
model.summary()

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=200,
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=15,
    min_lr=0.001
)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr])

# Save the trained model
model.save('lstm_model.h5')

# Evaluate the model
mae = model.evaluate(X_val, y_val)[1]
rmse = model.evaluate(X_val, y_val)[2]
print(f"Validation MAE: {mae}")
print(f"Validation RMSE: {rmse}")

Epoch 1/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 159ms/step - loss: 0.0021 - mean_absolute_error: 0.0247 - root_mean_squared_error: 0.0461 - val_loss: 0.0116 - val_mean_absolute_error: 0.1056 - val_root_mean_squared_error: 0.1075 - learning_rate: 0.0010
Epoch 2/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 153ms/step - loss: 4.9671e-04 - mean_absolute_error: 0.0162 - root_mean_squared_error: 0.0223 - val_loss: 0.0082 - val_mean_absolute_error: 0.0892 - val_root_mean_squared_error: 0.0907 - learning_rate: 0.0010
Epoch 3/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 153ms/step - loss: 3.7803e-04 - mean_absolute_error: 0.0143 - root_mean_squared_error: 0.0194 - val_loss: 0.0053 - val_mean_absolute_error: 0.0717 - val_root_mean_squared_error: 0.0729 - learning_rate: 0.0010
Epoch 4/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 153ms/step - loss: 2.9607e-04 - mean_absolute_err



[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - loss: 0.0022 - mean_absolute_error: 0.0440 - root_mean_squared_error: 0.0474
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - loss: 0.0022 - mean_absolute_error: 0.0440 - root_mean_squared_error: 0.0474
Validation MAE: 0.04397391900420189
Validation RMSE: 0.047369834035634995


In [None]:
from tensorflow.keras.layers import Bidirectional

# Build the model
def build_bilstm_model(input_shape):
    inputs = Input(shape=input_shape)
    x = cnn_block(inputs)

    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.3)(x)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = Dropout(0.3)(x)
    x = Bidirectional(LSTM(32, return_sequences=True))(x)
    x = Dropout(0.3)(x)

    attention = Attention()(x)

    outputs = Dense(FORECAST_LENGTH)(attention)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()])
    return model

model = build_bilstm_model((SEQ_LENGTH, 5))
model.summary()

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=200,
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=15,
    min_lr=0.001
)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr])

# Save the trained model
model.save('bilstm_model.h5')

# Evaluate the model
mae = model.evaluate(X_val, y_val)[1]
rmse = model.evaluate(X_val, y_val)[2]
print(f"Validation MAE: {mae}")
print(f"Validation RMSE: {rmse}")

Epoch 1/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 181ms/step - loss: 0.0015 - mean_absolute_error: 0.0213 - root_mean_squared_error: 0.0385 - val_loss: 0.0317 - val_mean_absolute_error: 0.1765 - val_root_mean_squared_error: 0.1781 - learning_rate: 0.0010
Epoch 2/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 181ms/step - loss: 3.9948e-04 - mean_absolute_error: 0.0146 - root_mean_squared_error: 0.0200 - val_loss: 0.0170 - val_mean_absolute_error: 0.1295 - val_root_mean_squared_error: 0.1302 - learning_rate: 0.0010
Epoch 3/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 182ms/step - loss: 2.8921e-04 - mean_absolute_error: 0.0126 - root_mean_squared_error: 0.0170 - val_loss: 0.0095 - val_mean_absolute_error: 0.0960 - val_root_mean_squared_error: 0.0974 - learning_rate: 0.0010
Epoch 4/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 182ms/step - loss: 2.2991e-04 - mean_absolute_err



[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 40ms/step - loss: 0.0029 - mean_absolute_error: 0.0454 - root_mean_squared_error: 0.0543
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 38ms/step - loss: 0.0029 - mean_absolute_error: 0.0454 - root_mean_squared_error: 0.0543
Validation MAE: 0.04538332670927048
Validation RMSE: 0.05430346727371216


In [None]:
from tensorflow.keras.layers import GRU

def create_model(input_shape):
    inputs = Input(shape=input_shape)
    x = cnn_block(inputs)                            

    # Replace LSTM with Bidirectional GRU layers
    x = GRU(128, return_sequences=True)(x)
    x = Dropout(0.3)(x)
    x = GRU(64, return_sequences=True)(x)
    x = Dropout(0.3)(x)
    x = GRU(32, return_sequences=True)(x)
    x = Dropout(0.3)(x)

    # Attention layer
    attention = Attention()(x)

    outputs = Dense(FORECAST_LENGTH)(attention)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()])
    return model

model = create_model((SEQ_LENGTH, 5))
model.summary()

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=200,
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=15,
    min_lr=0.001
)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr])

# Save the trained model
model.save('gru_model.h5')

# Evaluate the model
mae = model.evaluate(X_val, y_val)[1]
rmse = model.evaluate(X_val, y_val)[2]
print(f"Validation MAE: {mae}")
print(f"Validation RMSE: {rmse}")

Epoch 1/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 147ms/step - loss: 0.0024 - mean_absolute_error: 0.0271 - root_mean_squared_error: 0.0489 - val_loss: 0.0269 - val_mean_absolute_error: 0.1614 - val_root_mean_squared_error: 0.1640 - learning_rate: 0.0010
Epoch 2/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 144ms/step - loss: 5.8516e-04 - mean_absolute_error: 0.0181 - root_mean_squared_error: 0.0242 - val_loss: 0.0288 - val_mean_absolute_error: 0.1658 - val_root_mean_squared_error: 0.1696 - learning_rate: 0.0010
Epoch 3/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 144ms/step - loss: 4.5220e-04 - mean_absolute_error: 0.0162 - root_mean_squared_error: 0.0213 - val_loss: 0.0286 - val_mean_absolute_error: 0.1650 - val_root_mean_squared_error: 0.1691 - learning_rate: 0.0010
Epoch 4/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 144ms/step - loss: 3.8390e-04 - mean_absolute_err



[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - loss: 0.0033 - mean_absolute_error: 0.0556 - root_mean_squared_error: 0.0571
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - loss: 0.0033 - mean_absolute_error: 0.0556 - root_mean_squared_error: 0.0571
Validation MAE: 0.05564403161406517
Validation RMSE: 0.05707588419318199


In [21]:
def create_tcn_model(input_shape):
    inputs = Input(shape=input_shape)

    x = cnn_block(inputs)

    # ---- TCN Block (stacked dilated convolutions + residuals) ----
    x = tcn_block(
        x,
        filters=64,
        kernel_size=3,
        dilations=[1, 2, 4, 8, 16]   # very important for TCN!
    )

    # ---- second TCN stage (deeper model) ----
    x = tcn_block(
        x,
        filters=32,
        kernel_size=3,
        dilations=[1, 2, 4, 8]
    )

    # ---- Attention Layer ----
    attention_out = Attention()(x)

    # ---- Output Layer (forecast next N timesteps) ----
    outputs = Dense(FORECAST_LENGTH)(attention_out)

    model = Model(inputs, outputs)

    model.compile(
        optimizer="adam",
        loss="mse",
        metrics=[
            tf.keras.metrics.MeanAbsoluteError(),
            tf.keras.metrics.RootMeanSquaredError()
        ]
    )

    return model


# Build model
model = create_tcn_model((SEQ_LENGTH, 5))
model.summary()

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=50,
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=10,
    min_lr=1e-6
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr]
)

# Save
model.save('tcn_model.h5')

# Evaluate
mae = model.evaluate(X_val, y_val)[1]
rmse = model.evaluate(X_val, y_val)[2]
print(f"Validation MAE: {mae}")
print(f"Validation RMSE: {rmse}")

Epoch 1/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 129ms/step - loss: 0.0098 - mean_absolute_error: 0.0503 - root_mean_squared_error: 0.0990 - val_loss: 0.3625 - val_mean_absolute_error: 0.5955 - val_root_mean_squared_error: 0.6021 - learning_rate: 0.0010
Epoch 2/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 120ms/step - loss: 0.0019 - mean_absolute_error: 0.0317 - root_mean_squared_error: 0.0433 - val_loss: 0.3397 - val_mean_absolute_error: 0.5781 - val_root_mean_squared_error: 0.5829 - learning_rate: 0.0010
Epoch 3/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 119ms/step - loss: 0.0012 - mean_absolute_error: 0.0262 - root_mean_squared_error: 0.0349 - val_loss: 0.2520 - val_mean_absolute_error: 0.4997 - val_root_mean_squared_error: 0.5020 - learning_rate: 0.0010
Epoch 4/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 121ms/step - loss: 9.3700e-04 - mean_absolute_error: 0.02



[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - loss: 0.0208 - mean_absolute_error: 0.1430 - root_mean_squared_error: 0.1442
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - loss: 0.0208 - mean_absolute_error: 0.1430 - root_mean_squared_error: 0.1442
Validation MAE: 0.1429571807384491
Validation RMSE: 0.14415742456912994


In [None]:
def build_lstm_mhsa_model(input_shape):
    inputs = Input(shape=input_shape)
    x = cnn_block(inputs)

    x = LSTM(128, return_sequences=True)(x)
    x = Dropout(0.3)(x)
    x = LSTM(64, return_sequences=True)(x)
    x = Dropout(0.3)(x)
    x = LSTM(32, return_sequences=True)(x)
    x = Dropout(0.3)(x)

    attention = MultiHeadSelfAttention(
        num_heads=4,
        head_dim=32,
        dropout_rate=0.1,
        use_causal_mask=True  # Ensures no future peeking
    )(x)

    outputs = Dense(FORECAST_LENGTH)(attention)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mse",
                  metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()])

    return model

model = build_lstm_mhsa_model((SEQ_LENGTH, 5))
model.summary()

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=200,
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=15,
    min_lr=0.001
)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr])

# Save the trained model
model.save('lstm_mhsa_model.h5')

# Evaluate the model
mae = model.evaluate(X_val, y_val)[1]
rmse = model.evaluate(X_val, y_val)[2]
print(f"Validation MAE: {mae}")
print(f"Validation RMSE: {rmse}")

Epoch 1/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 171ms/step - loss: 0.0058 - mean_absolute_error: 0.0482 - root_mean_squared_error: 0.0759 - val_loss: 0.0214 - val_mean_absolute_error: 0.1423 - val_root_mean_squared_error: 0.1463 - learning_rate: 0.0010
Epoch 2/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 170ms/step - loss: 0.0016 - mean_absolute_error: 0.0299 - root_mean_squared_error: 0.0404 - val_loss: 0.0233 - val_mean_absolute_error: 0.1487 - val_root_mean_squared_error: 0.1526 - learning_rate: 0.0010
Epoch 3/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 170ms/step - loss: 0.0011 - mean_absolute_error: 0.0251 - root_mean_squared_error: 0.0328 - val_loss: 0.0236 - val_mean_absolute_error: 0.1497 - val_root_mean_squared_error: 0.1537 - learning_rate: 0.0010
Epoch 4/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 169ms/step - loss: 8.2156e-04 - mean_absolute_error: 0.02



[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 36ms/step - loss: 0.0036 - mean_absolute_error: 0.0543 - root_mean_squared_error: 0.0604
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 36ms/step - loss: 0.0036 - mean_absolute_error: 0.0543 - root_mean_squared_error: 0.0604
Validation MAE: 0.05427715182304382
Validation RMSE: 0.06040828302502632


In [None]:

def build_bilstm_mhsa_model(input_shape):
    inputs = Input(shape=input_shape)
    x = cnn_block(inputs)

    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.3)(x)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = Dropout(0.3)(x)
    x = Bidirectional(LSTM(32, return_sequences=True))(x)
    x = Dropout(0.3)(x)

    attention = MultiHeadSelfAttention(
        num_heads=4,
        head_dim=32,
        dropout_rate=0.1,
        use_causal_mask=True  # Ensures no future peeking
    )(x)

    outputs = Dense(FORECAST_LENGTH)(attention)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mse",
                  metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()])

    return model

model = build_bilstm_mhsa_model((SEQ_LENGTH, 5))
model.summary()

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=200,
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=15,
    min_lr=0.001
)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr])

# Save the trained model
model.save('bilstm_mhsa_model.h5')

# Evaluate the model
mae = model.evaluate(X_val, y_val)[1]
rmse = model.evaluate(X_val, y_val)[2]
print(f"Validation MAE: {mae}")
print(f"Validation RMSE: {rmse}")

Epoch 1/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 220ms/step - loss: 0.0035 - mean_absolute_error: 0.0383 - root_mean_squared_error: 0.0594 - val_loss: 0.0493 - val_mean_absolute_error: 0.2187 - val_root_mean_squared_error: 0.2219 - learning_rate: 0.0010
Epoch 2/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 213ms/step - loss: 0.0012 - mean_absolute_error: 0.0259 - root_mean_squared_error: 0.0348 - val_loss: 0.0513 - val_mean_absolute_error: 0.2225 - val_root_mean_squared_error: 0.2264 - learning_rate: 0.0010
Epoch 3/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 216ms/step - loss: 8.5900e-04 - mean_absolute_error: 0.0225 - root_mean_squared_error: 0.0293 - val_loss: 0.0362 - val_mean_absolute_error: 0.1875 - val_root_mean_squared_error: 0.1902 - learning_rate: 0.0010
Epoch 4/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 207ms/step - loss: 6.9817e-04 - mean_absolute_error: 



[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 43ms/step - loss: 0.0025 - mean_absolute_error: 0.0450 - root_mean_squared_error: 0.0503
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 42ms/step - loss: 0.0025 - mean_absolute_error: 0.0450 - root_mean_squared_error: 0.0503
Validation MAE: 0.045029956847429276
Validation RMSE: 0.050259072333574295


In [None]:
def build_gru_mhsa_model(input_shape):
    inputs = Input(shape=input_shape)
    x = cnn_block(inputs)

    # Replace LSTM with Bidirectional GRU layers
    x = GRU(128, return_sequences=True)(x)
    x = Dropout(0.3)(x)
    x = GRU(64, return_sequences=True)(x)
    x = Dropout(0.3)(x)
    x = GRU(32, return_sequences=True)(x)
    x = Dropout(0.3)(x)

    attention = MultiHeadSelfAttention(
        num_heads=4,
        head_dim=32,
        dropout_rate=0.1,
        use_causal_mask=True  # Ensures no future peeking
    )(x)

    outputs = Dense(FORECAST_LENGTH)(attention)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mse",
                  metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()])

    return model

model = build_gru_mhsa_model((SEQ_LENGTH, 5))
model.summary()

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=200,
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=15,
    min_lr=0.001
)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr])

# Save the trained model
model.save('gru_mhsa_model.h5')

# Evaluate the model
mae = model.evaluate(X_val, y_val)[1]
rmse = model.evaluate(X_val, y_val)[2]
print(f"Validation MAE: {mae}")
print(f"Validation RMSE: {rmse}")

Epoch 1/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 170ms/step - loss: 0.0048 - mean_absolute_error: 0.0460 - root_mean_squared_error: 0.0694 - val_loss: 0.0274 - val_mean_absolute_error: 0.1626 - val_root_mean_squared_error: 0.1654 - learning_rate: 0.0010
Epoch 2/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 180ms/step - loss: 0.0016 - mean_absolute_error: 0.0300 - root_mean_squared_error: 0.0403 - val_loss: 0.0397 - val_mean_absolute_error: 0.1947 - val_root_mean_squared_error: 0.1993 - learning_rate: 0.0010
Epoch 3/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 169ms/step - loss: 0.0010 - mean_absolute_error: 0.0247 - root_mean_squared_error: 0.0321 - val_loss: 0.0371 - val_mean_absolute_error: 0.1879 - val_root_mean_squared_error: 0.1927 - learning_rate: 0.0010
Epoch 4/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 169ms/step - loss: 8.5431e-04 - mean_absolute_error: 0.02



[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 34ms/step - loss: 0.0040 - mean_absolute_error: 0.0616 - root_mean_squared_error: 0.0636
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - loss: 0.0040 - mean_absolute_error: 0.0616 - root_mean_squared_error: 0.0636
Validation MAE: 0.061557263135910034
Validation RMSE: 0.06361231207847595


In [10]:
def build_tcn_mhsa_model(input_shape):
    inputs = Input(shape=input_shape)
    
    x = cnn_block(inputs)

    # ---- TCN Block (stacked dilated convolutions + residuals) ----
    x = tcn_block(
        x,
        filters=64,
        kernel_size=3,
        dilations=[1, 2, 4, 8, 16]   # very important for TCN!
    )

    # ---- second TCN stage (deeper model) ----
    x = tcn_block(
        x,
        filters=32,
        kernel_size=3,
        dilations=[1, 2, 4, 8]
    )

    attention = MultiHeadSelfAttention(
        num_heads=4,
        head_dim=32,
        dropout_rate=0.1,
        use_causal_mask=True  # Ensures no future peeking
    )(x)

    outputs = Dense(FORECAST_LENGTH)(attention)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mse",
                  metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()])

    return model

model = build_tcn_mhsa_model((SEQ_LENGTH, 5))
model.summary()

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=200,
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=15,
    min_lr=0.001
)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr])

# Save the trained model
model.save('tcn_mhsa_model.h5')

# Evaluate the model
mae = model.evaluate(X_val, y_val)[1]
rmse = model.evaluate(X_val, y_val)[2]
print(f"Validation MAE: {mae}")
print(f"Validation RMSE: {rmse}")

2025-11-20 17:37:11.572776: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 130ms/step - loss: 0.0095 - mean_absolute_error: 0.0592 - root_mean_squared_error: 0.0973 - val_loss: 0.4554 - val_mean_absolute_error: 0.6680 - val_root_mean_squared_error: 0.6748 - learning_rate: 0.0010
Epoch 2/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 132ms/step - loss: 0.0031 - mean_absolute_error: 0.0409 - root_mean_squared_error: 0.0558 - val_loss: 0.4301 - val_mean_absolute_error: 0.6512 - val_root_mean_squared_error: 0.6558 - learning_rate: 0.0010
Epoch 3/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 130ms/step - loss: 0.0019 - mean_absolute_error: 0.0337 - root_mean_squared_error: 0.0439 - val_loss: 0.3293 - val_mean_absolute_error: 0.5717 - val_root_mean_squared_error: 0.5738 - learning_rate: 0.0010
Epoch 4/100
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 130ms/step - loss: 0.0015 - mean_absolute_error: 0.0301 -



[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - loss: 0.0853 - mean_absolute_error: 0.2909 - root_mean_squared_error: 0.2921
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - loss: 0.0853 - mean_absolute_error: 0.2909 - root_mean_squared_error: 0.2921
Validation MAE: 0.29087039828300476
Validation RMSE: 0.2921425700187683
