In [None]:
# Mount Google Drive into the Colab runtime; the CSV read later in this notebook
# lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from itertools import product
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Load the raw flow records from the mounted Drive folder.
# The rest of the notebook reads the 'date', 'time' and 'flow' columns of this file.
traffic_data = pd.read_csv('/content/drive/MyDrive/traffic predict/traffic_flow_data/flows/GD030A_S.csv')

## 1. Recover timestamp

In [None]:
# Define the recover_timestamp function
def recover_timestamp(data):
    """
    Rebuild a gap-free hourly time index for the raw flow records.

    Parameters:
    - data: DataFrame with a 'date' column ('YYYY-MM-DD' strings) and a 'time' column
      holding the hour of day; every other column is carried over unchanged.

    Returns:
    - A new DataFrame indexed by every hourly timestamp between the first and the last
      observation in `data`. Hours that have no record appear as all-NaN rows.

    The caller's DataFrame is not modified, so the cell that calls this function can be
    re-run without the input accumulating a 'datetime' column.
    """
    # Combine 'date' and 'time' into one timestamp per row, e.g. '2019-10-01' + 13 -> 2019-10-01 13:00
    timestamps = pd.to_datetime(
        data['date'] + ' ' + data['time'].astype(str) + ':00',
        format='%Y-%m-%d %H:%M'
    )

    # assign() returns a copy, so the temporary column never reaches the caller's frame
    indexed = data.assign(datetime=timestamps).set_index('datetime')

    # Complete hourly range; a Timedelta step avoids the deprecated 'H' frequency alias
    full_time_range = pd.date_range(
        start=indexed.index.min(),
        end=indexed.index.max(),
        freq=pd.Timedelta(hours=1)
    )

    # Reindex so that missing hours show up as NaN rows
    return indexed.reindex(full_time_range)

In [None]:
# Apply the recover_timestamp function to recover the full hourly time series.
# Hours that were missing from the CSV become NaN rows in traffic_full.
traffic_full = recover_timestamp(traffic_data)
traffic_full

Unnamed: 0,date,time,flow
2019-10-01 00:00:00,2019-10-01,0.0,15.0
2019-10-01 01:00:00,2019-10-01,1.0,9.0
2019-10-01 02:00:00,2019-10-01,2.0,9.0
2019-10-01 03:00:00,2019-10-01,3.0,7.0
2019-10-01 04:00:00,2019-10-01,4.0,9.0
...,...,...,...
2023-09-30 19:00:00,2023-09-30,19.0,129.0
2023-09-30 20:00:00,2023-09-30,20.0,119.0
2023-09-30 21:00:00,2023-09-30,21.0,106.0
2023-09-30 22:00:00,2023-09-30,22.0,88.0


## 2. Train, validate, test data split

In [None]:
# Chronological split by label-based slicing on the datetime index (both bounds inclusive)
train_set = traffic_full.loc[:'2022-02-28 23:00:00']
valid_set = traffic_full.loc['2022-03-01 00:00:00':'2022-12-31 23:00:00']
test_set = traffic_full.loc['2023-01-01 00:00:00':]

# Share of the full hourly series that falls into each split
n_total_rows = len(traffic_full)
print('Proportion of train_set : {:.4f}'.format(len(train_set) / n_total_rows))
print('Proportion of valid_set : {:.4f}'.format(len(valid_set) / n_total_rows))
print('Proportion of test_set : {:.4f}'.format(len(test_set) / n_total_rows))

Proportion of train_set : 0.6037
Proportion of valid_set : 0.2094
Proportion of test_set : 0.1869


## 3. Split the data into X and y



In [None]:
# Define the create_multi_step_sequence function
def create_multi_step_sequence(data, last_n_steps, day_lag, week_lag, n_future_steps):
    """
    Build supervised samples from the 'flow' column using a recent window plus two lagged values.

    Parameters:
    - data: DataFrame with a numeric 'flow' column ordered in time (may contain NaN)
    - last_n_steps: Number of most recent steps used as input (the notebook uses 12)
    - day_lag: How many steps back the "last day" value lies (the notebook uses 24)
    - week_lag: How many steps back the "last week" value lies (the notebook uses 168)
    - n_future_steps: Number of future steps used as the target

    Returns:
    - X: Inputs of shape (samples, last_n_steps + 2, 1); each sample is the recent window
      (oldest first) followed by the day-lag value and the week-lag value
    - y: Targets of shape (samples, n_future_steps)
    - df_X: X as a DataFrame with one named column per input position
    - df_y: y as a DataFrame with one named column per future step

    A sample is kept only when none of its inputs or targets is NaN.
    """
    # Pull the column out once instead of on every loop iteration
    flow = data['flow'].values
    n_inputs = last_n_steps + 2  # recent window + day-lag value + week-lag value

    X, y = [], []

    # i is the position of the first target. The first usable i needs the longest look-back
    # to exist; the last usable i is len(flow) - n_future_steps (hence the "+ 1" for range),
    # because flow[i:i + n_future_steps] must still be a full window.
    first_target = max(last_n_steps, day_lag, week_lag)
    for i in range(first_target, len(flow) - n_future_steps + 1):
        # Recent window, then the values one "day" and one "week" before the target
        features = np.concatenate([
            flow[i - last_n_steps:i],
            [flow[i - day_lag]],
            [flow[i - week_lag]],
        ])
        # Output (next `n_future_steps` observations)
        target = flow[i:i + n_future_steps]

        # Skip any sample touched by a missing observation
        if np.isnan(features).any() or np.isnan(target).any():
            continue

        X.append(features)
        y.append(target)

    # Convert to numpy arrays; X gets the (samples, timesteps, features) layout the LSTM expects
    X = np.array(X).reshape(-1, n_inputs, 1)
    y = np.array(y).reshape(-1, n_future_steps)

    # Column names for the input DataFrame, in the same order as the rows of each sample
    input_columns = [f'Step_{i}_back' for i in range(last_n_steps, 0, -1)] + ['Day_1_back', 'Week_1_back']

    # Width is derived from last_n_steps so window lengths other than 12 also work
    df_X = pd.DataFrame(X.reshape(-1, n_inputs), columns=input_columns)

    # One column per future step
    output_columns = [f'Next_Step_{i}' for i in range(0, n_future_steps)]
    df_y = pd.DataFrame(y, columns=output_columns)

    return X, y, df_X, df_y

# Code to predict next 6 steps step-by-step

#### We will use
* the last 12 steps

* the value one day earlier (24 steps back)

* the value one week earlier (168 steps back)

* to forecast the current step (step 0)

## 4. Create input and output data

In [None]:
# Build the input-output sequences; one shared configuration keeps the three splits consistent
window_config = dict(last_n_steps=12, day_lag=24, week_lag=168, n_future_steps=1)

X_train, y_train, X_train_df, y_train_df = create_multi_step_sequence(train_set, **window_config)
X_valid, y_valid, X_valid_df, y_valid_df = create_multi_step_sequence(valid_set, **window_config)
X_test, y_test, X_test_df, y_test_df = create_multi_step_sequence(test_set, **window_config)

In [None]:
# Quick check of the training arrays: shapes first, then the (truncated) contents
X_train.shape, X_train, y_train.shape, y_train

((19570, 14, 1),
 array([[[173.],
         [168.],
         [155.],
         ...,
         [ 27.],
         [ 21.],
         [ 15.]],
 
        [[168.],
         [155.],
         [186.],
         ...,
         [  9.],
         [  8.],
         [  9.]],
 
        [[155.],
         [186.],
         [333.],
         ...,
         [ 10.],
         [ 10.],
         [  9.]],
 
        ...,
 
        [[141.],
         [142.],
         [107.],
         ...,
         [160.],
         [129.],
         [131.]],
 
        [[142.],
         [107.],
         [128.],
         ...,
         [ 94.],
         [ 87.],
         [ 77.]],
 
        [[107.],
         [128.],
         [150.],
         ...,
         [ 80.],
         [ 63.],
         [ 35.]]]),
 (19570, 1),
 array([[ 9.],
        [10.],
        [ 8.],
        ...,
        [94.],
        [80.],
        [63.]]))

## 5. Normalise the data after split (step-by-step)

Normalise X

In [None]:
# Independent min-max scalers: one for the model inputs, one for the targets
x_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaler = MinMaxScaler(feature_range=(0, 1))

# Dimensions of the training tensor; n_timesteps and n_features are reused for the other splits
n_samples, n_timesteps, n_features = X_train.shape

# x_train: flatten to (n_samples * n_timesteps, n_features), learn the scaling on the
# training data only, apply it, then restore the 3D layout
x_train_reshaped = X_train.reshape(-1, n_features)
x_train_scaled = x_scaler.fit_transform(x_train_reshaped).reshape(n_samples, n_timesteps, n_features)

# x_val: same flatten -> scale -> restore, using the scaling learned from x_train
n_val_samples = X_valid.shape[0]
x_val_reshaped = X_valid.reshape(-1, n_features)
x_val_scaled = x_scaler.transform(x_val_reshaped).reshape(n_val_samples, n_timesteps, n_features)

# x_test: same again
n_test_samples = X_test.shape[0]
x_test_reshaped = X_test.reshape(-1, n_features)
x_test_scaled = x_scaler.transform(x_test_reshaped).reshape(n_test_samples, n_timesteps, n_features)

Normalise y

In [None]:
# y_train: flatten to one column of shape (n_samples * n_outputs, 1), learn the target
# scaling on the training data only, apply it, then restore (n_samples, n_outputs)
y_train_reshaped = y_train.reshape(-1, 1)
y_train_scaled = y_scaler.fit_transform(y_train_reshaped).reshape(n_samples, y_train.shape[1])

# y_val: scaled with the parameters learned from y_train
y_val_reshaped = y_valid.reshape(-1, 1)
y_val_scaled = y_scaler.transform(y_val_reshaped).reshape(n_val_samples, y_valid.shape[1])

# y_test: scaled with the parameters learned from y_train
y_test_reshaped = y_test.reshape(-1, 1)
y_test_scaled = y_scaler.transform(y_test_reshaped).reshape(n_test_samples, y_test.shape[1])

## 6. Build LSTM

In [None]:
def create_lstm_model(input_shape, units, dropout_rate, learning_rate):
    """
    Build and compile a single-layer LSTM regressor with one output value.

    Parameters:
    - input_shape: (timesteps, features) of one input sample
    - units: Number of LSTM units
    - dropout_rate: Dropout rate applied to the LSTM output
    - learning_rate: Learning rate of the Adam optimizer

    Returns:
    - A compiled Keras model (loss: MSE, metric: MAE)
    """
    model = keras.Sequential()
    # Declare the input with an explicit Input object; passing `input_shape` to the first
    # layer is deprecated in recent Keras releases
    model.add(keras.Input(shape=input_shape))
    model.add(LSTM(units=units, activation='tanh'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1)) # Output layer for one-step prediction
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        loss='mse',
        optimizer=optimizer,
        metrics=['mae']
    )
    return model

# (timesteps, features) of one scaled training sample
input_shape = (x_train_scaled.shape[1], x_train_scaled.shape[2])

## 7. Hyperparameter Tuning

In [None]:
# Candidate values for each tuned hyperparameter
units_list = [50, 100, 200]
dropout_rates = [0, 0.3, 0.5]
learning_rates = [0.01, 0.001, 0.0001]
batch_sizes = [32, 64, 128]

# Full grid as (units, dropout_rate, learning_rate, batch_size) tuples; the right-most
# option varies fastest
hyperparameter_combinations = [
    (n_units, rate, lr, batch)
    for n_units in units_list
    for rate in dropout_rates
    for lr in learning_rates
    for batch in batch_sizes
]

In [None]:
# Fixed seed so the search is reproducible after a kernel restart and every combination
# starts from the same random state (differences then come from the hyperparameters,
# not from the weight initialisation)
SEED = 42

# Initialize variables to store the best model and hyperparameters
best_val_mae = np.inf
best_hyperparams = None
best_model = None

for idx, (units, dropout_rate, learning_rate, batch_size) in enumerate(hyperparameter_combinations):
    print(f"\nCombination {idx+1}/{len(hyperparameter_combinations)}")
    print(f"Training with units={units}, dropout_rate={dropout_rate}, learning_rate={learning_rate}, batch_size={batch_size}")

    # Re-seed the Python, NumPy and TensorFlow generators before building each model
    keras.utils.set_random_seed(SEED)

    # Create the LSTM model with the current hyperparameters
    model = create_lstm_model(
        input_shape=input_shape,
        units=units,
        dropout_rate=dropout_rate,
        learning_rate=learning_rate
    )

    # Stop when the validation MAE has not improved for 10 epochs and roll back to the
    # best weights; mode='min' states explicitly that lower is better
    early_stopping = EarlyStopping(
        monitor='val_mae',
        mode='min',
        patience=10,
        restore_best_weights=True,
        verbose=1
    )

    # Train the model
    history = model.fit(
        x_train_scaled, y_train_scaled,
        epochs=50,
        batch_size=batch_size,
        validation_data=(x_val_scaled, y_val_scaled),
        callbacks=[early_stopping],
        verbose=0
    )

    # Best validation MAE of this run (in scaled units, because the targets are scaled)
    val_mae = min(history.history['val_mae'])
    print(f"Validation MAE: {val_mae:.4f}")

    # Update best model if current one is better
    if val_mae < best_val_mae:
        best_val_mae = val_mae
        best_model = model
        best_hyperparams = {
            'units': units,
            'dropout_rate': dropout_rate,
            'learning_rate': learning_rate,
            'batch_size': batch_size
        }

# best_hyperparams stays None when the grid is empty or no run produced a finite validation MAE
if best_hyperparams is None:
    print("\nNo hyperparameter combination produced a usable validation MAE.")
else:
    print("\nBest Hyperparameters:")
    for param, value in best_hyperparams.items():
        print(f"{param}: {value}")
    print(f"Best Validation MAE: {best_val_mae:.4f}")



Combination 1/81
Training with units=50, dropout_rate=0, learning_rate=0.01, batch_size=32
Epoch 32: early stopping
Restoring model weights from the end of the best epoch: 22.
Validation MAE: 0.0392

Combination 2/81
Training with units=50, dropout_rate=0, learning_rate=0.01, batch_size=64
Epoch 21: early stopping
Restoring model weights from the end of the best epoch: 11.
Validation MAE: 0.0395

Combination 3/81
Training with units=50, dropout_rate=0, learning_rate=0.01, batch_size=128
Epoch 27: early stopping
Restoring model weights from the end of the best epoch: 17.
Validation MAE: 0.0388

Combination 4/81
Training with units=50, dropout_rate=0, learning_rate=0.001, batch_size=32
Restoring model weights from the end of the best epoch: 49.
Validation MAE: 0.0394

Combination 5/81
Training with units=50, dropout_rate=0, learning_rate=0.001, batch_size=64
Restoring model weights from the end of the best epoch: 50.
Validation MAE: 0.0399

Combination 6/81
Training with units=50, dropo

## 8. Recursive Forecasting with LSTM (step-by-step)

In [None]:
# --- Recursive Forecasting with LSTM ---
def recursive_forecast_lstm(model, input_seq, n_steps, x_scaler, y_scaler,
                            future_lag_features=None, n_lag_features=2):
    """
    Forecast `n_steps` values one at a time, feeding each prediction back as input.

    The input layout is "recent window followed by lag rows" (in this notebook: 12 recent
    steps, then the day-lag and week-lag values). Only the recent window is slid forward.
    The lag rows stay in their own positions, so the model always sees each feature in
    the slot it was trained on.

    Parameters:
    - model: Object with a Keras-style predict(x, verbose=0) returning an array of shape (1, 1)
    - input_seq: Scaled input sample of shape (n_timesteps, n_features); it is not modified
    - n_steps: Number of steps to forecast
    - x_scaler: Scaler with transform(), used to put a prediction back on the input scale
    - y_scaler: Scaler with inverse_transform(), used to bring a prediction to the original scale
    - future_lag_features: Optional scaled lag rows for every forecast step, reshapeable to
      (>= n_steps, n_lag_features, n_features). Row k is written into the lag positions
      before forecast step k, so row 0 normally equals the lag rows of `input_seq`.
      When omitted, the lag rows of `input_seq` are reused unchanged for every step.
    - n_lag_features: Number of trailing rows of `input_seq` that hold lag values instead of
      the recent window. Pass 0 for an input that is a pure sliding window.

    Returns:
    - List of `n_steps` predictions in the original (unscaled) units

    Raises:
    - ValueError: if n_lag_features leaves no recent window, or future_lag_features has
      fewer than n_steps rows
    """
    # Work on a float copy so the caller's array is never modified
    current_input = np.array(input_seq, dtype=float)
    n_timesteps, n_features = current_input.shape

    if not 0 <= n_lag_features < n_timesteps:
        raise ValueError("n_lag_features must be >= 0 and smaller than the number of timesteps")
    n_recent = n_timesteps - n_lag_features  # rows [0, n_recent) form the sliding window

    lag_rows = None
    if future_lag_features is not None and n_lag_features > 0:
        lag_rows = np.asarray(future_lag_features, dtype=float).reshape(-1, n_lag_features, n_features)
        if len(lag_rows) < n_steps:
            raise ValueError("future_lag_features must provide lag rows for every forecast step")

    predictions = []
    for step in range(n_steps):
        # Refresh the lag rows for this forecast step when the caller supplied them
        if lag_rows is not None:
            current_input[n_recent:] = lag_rows[step]

        # Reshape to (1, n_timesteps, n_features) and predict (output is on the y_scaler scale)
        yhat_scaled = model.predict(current_input[np.newaxis, ...], verbose=0)  # Shape: (1, 1)

        # Inverse transform the prediction to original scale
        yhat = y_scaler.inverse_transform(yhat_scaled)  # Shape: (1, 1)
        predictions.append(yhat[0, 0])

        # Put the prediction on the input scale and slide ONLY the recent window:
        # drop its oldest row and append the prediction as the newest one
        yhat_as_input = x_scaler.transform(yhat)  # Shape: (1, n_features)
        current_input[:n_recent] = np.vstack((current_input[1:n_recent], yhat_as_input))

    return predictions

## 9. Make step-by-step prediction

In [None]:
# Number of steps to predict
n_steps = 6  # Adjust as needed

# Initialize lists to store predictions and actual values
all_predictions = []
all_actuals = []

n_test_samples = x_test_scaled.shape[0]

# The forecast started from sample i targets y_test[i] first (y_test[i] is the value right
# after the input window of x_test_scaled[i]), then y_test[i+1], and so on. The last start
# that still has n_steps actual values is therefore n_test_samples - n_steps.
# NOTE(review): consecutive rows of y_test are consecutive hours only where no sample was
# dropped for NaN; forecasts that span such a gap are compared against the wrong hours.
for i in range(n_test_samples - n_steps + 1):
    # Get the input sequence for the current sample
    input_seq = x_test_scaled[i]

    # Perform recursive forecasting
    predictions = recursive_forecast_lstm(
        model=best_model,
        input_seq=input_seq,
        n_steps=n_steps,
        x_scaler=x_scaler,
        y_scaler=y_scaler
    )

    # Actual values (original scale) aligned with the forecasts: step k <-> y_test[i + k]
    actual_values = y_test[i:i + n_steps].flatten()

    # Store the predictions and actual values
    all_predictions.append(predictions)
    all_actuals.append(actual_values)

# Convert lists to numpy arrays of shape (n_forecasts, n_steps)
all_predictions = np.array(all_predictions)
all_actuals = np.array(all_actuals)

## 10. Evaluating the LSTM Model

In [None]:
# Compute evaluation metrics separately for every forecast horizon
for i in range(n_steps):
    y_true = all_actuals[:, i]
    y_pred = all_predictions[:, i]

    # Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_true, y_pred)

    # Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Mean Absolute Percentage Error (MAPE).
    # The percentage error is undefined where the actual value is 0, so those points are
    # left out. Replacing 0 with a tiny epsilon would turn each of them into an enormous
    # term that dominates the mean.
    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
    else:
        mape = np.nan  # no non-zero actual value at this horizon

    print(f"\nTime Step {i+1} Evaluation Metrics:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"MAPE: {mape:.2f}%")


Time Step 1 Evaluation Metrics:
RMSE: 41.4157
MAE: 30.0013
MAPE: 34.19%

Time Step 2 Evaluation Metrics:
RMSE: 54.1550
MAE: 38.7374
MAPE: 46.66%

Time Step 3 Evaluation Metrics:
RMSE: 64.6594
MAE: 48.2622
MAPE: 69.03%

Time Step 4 Evaluation Metrics:
RMSE: 65.3465
MAE: 52.1432
MAPE: 74.46%

Time Step 5 Evaluation Metrics:
RMSE: 73.7398
MAE: 59.4891
MAPE: 95.24%

Time Step 6 Evaluation Metrics:
RMSE: 81.3698
MAE: 66.3956
MAPE: 119.14%
