In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from itertools import product
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

In [2]:
traffic_data = pd.read_csv('GD030A_S.csv')

## 1. Recover timestamp

In [3]:
# Define the recover_timestamp function
def recover_timestamp(data):
    # Combine 'date' and 'time' to form a datetime column
    data['datetime'] = pd.to_datetime(data['date'] + ' ' + data['time'].astype(str) + ':00', format='%Y-%m-%d %H:%M')

    # Set 'datetime' as index
    data = data.set_index('datetime')

    # Create a complete range of timestamps with hourly frequency
    full_time_range = pd.date_range(start=data.index.min(), end=data.index.max(), freq='H')

    # Reindex the data to include all timestamps, filling missing rows with NaN
    data_full = data.reindex(full_time_range)

    return data_full

In [4]:
# Apply the recover_timestamp function to recover the full time series
traffic_full = recover_timestamp(traffic_data)
traffic_full

Unnamed: 0,date,time,flow
2019-10-01 00:00:00,2019-10-01,0.0,15.0
2019-10-01 01:00:00,2019-10-01,1.0,9.0
2019-10-01 02:00:00,2019-10-01,2.0,9.0
2019-10-01 03:00:00,2019-10-01,3.0,7.0
2019-10-01 04:00:00,2019-10-01,4.0,9.0
...,...,...,...
2023-09-30 19:00:00,2023-09-30,19.0,129.0
2023-09-30 20:00:00,2023-09-30,20.0,119.0
2023-09-30 21:00:00,2023-09-30,21.0,106.0
2023-09-30 22:00:00,2023-09-30,22.0,88.0


## 2. Train, validate, test data split

In [5]:
# train_set = traffic_full[:'2022-02-28 23:00:00']
# valid_set = traffic_full['2022-03-01 00:00:00':'2022-12-31 23:00:00']
# test_set = traffic_full['2023-01-01 00:00:00':]
train_set = traffic_full['2022-06-03 00:00:00':'2023-03-31 23:00:00']
valid_set = traffic_full['2023-04-01 00:00:00':'2023-06-30 23:00:00']
test_set = traffic_full['2023-07-01 00:00:00':]
print('Proportion of train_set : {:.4f}'.format(len(train_set)/len(traffic_full['2022-06-03 00:00:00':])))
print('Proportion of valid_set : {:.4f}'.format(len(valid_set)/len(traffic_full['2022-06-03 00:00:00':])))
print('Proportion of test_set : {:.4f}'.format(len(test_set)/len(traffic_full['2022-06-03 00:00:00':])))

Proportion of train_set : 0.6227
Proportion of valid_set : 0.1876
Proportion of test_set : 0.1897


In [6]:
print(train_set.isnull().sum(), len(train_set))
print(valid_set.isnull().sum(),len(valid_set))
print(test_set.isnull().sum(),len(test_set))

date    16
time    16
flow    16
dtype: int64 7248
date    61
time    61
flow    61
dtype: int64 2184
date    342
time    342
flow    342
dtype: int64 2208


## 3. Normalise the data 

In [7]:
# Initialize the scaler
scaler = MinMaxScaler()

# Fit the scaler on the training data's 'flow' feature
scaler.fit(train_set[['flow']])

# Transform the 'flow' feature in all datasets
train_set.loc[:, 'flow_scaled'] = scaler.transform(train_set[['flow']])
valid_set.loc[:, 'flow_scaled'] = scaler.transform(valid_set[['flow']])
test_set.loc[:, 'flow_scaled'] = scaler.transform(test_set[['flow']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set.loc[:, 'flow_scaled'] = scaler.transform(train_set[['flow']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_set.loc[:, 'flow_scaled'] = scaler.transform(valid_set[['flow']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set.loc[:, 'flow_scaled'] = scaler.transform(test_set[['

## 4. Split the data into X and y

In [11]:
def create_sequences(data, input_length, forecast_horizon):
    """
    Creates input-output sequences for time series data for SVR (MultiOutputRegressor).
    
    Parameters:
    - data: pandas DataFrame containing the data. Must include the 'flow_scaled' column.
    - input_length: int, number of past time steps to include in each input sequence.
    - forecast_horizon: int, number of future steps to predict.
    
    Returns:
    - X: numpy array of shape (num_valid_samples, input_length)
    - y: numpy array of shape (num_valid_samples, forecast_horizon)
    """
    X, y = [], []
    
    # Iterate through the data to create sequences
    for i in range(input_length, len(data) - forecast_horizon + 1):
        # Extract the input sequence (flattened to a 1D array)
        X_seq = data.iloc[i - input_length:i]['flow_scaled'].values
        
        # Extract the target sequence (future steps)
        y_seq = data.iloc[i:i + forecast_horizon]['flow_scaled'].values
        
        # Check for NaN values in the input sequence and target sequence
        if not np.isnan(X_seq).any() and not np.isnan(y_seq).any():
            X.append(X_seq)
            y.append(y_seq)
        else:
            # Optionally, log or count the skipped sequences
            pass  # Simply skip sequences with NaNs
        
    # Convert to numpy arrays
    X = np.array(X)  # Shape: (num_valid_samples, input_length)
    y = np.array(y)  # Shape: (num_valid_samples, forecast_horizon)
    
    return X, y


## 5. Create X and y

#### We will use
* the last 24*N steps

*  to forecast current (0 step) and 5 steps ahead

In [9]:
# Define Input Sequence Lengths
input_lengths = [24 * i for i in range(1, 22)]  # [24, 48, ..., 168]

In [12]:
from collections import defaultdict
data_dict = defaultdict(dict)

for length in input_lengths:
    print(f"Processing input length: {length}")
    
    # Create sequences with forecast_horizon=6
    X_train, y_train = create_sequences(train_set, length, forecast_horizon=6)
    X_val, y_val = create_sequences(valid_set, length, forecast_horizon=6)
    X_test, y_test = create_sequences(test_set, length, forecast_horizon=6)
    
    # Store in the dictionary
    data_dict[length]['X_train'] = X_train
    data_dict[length]['y_train'] = y_train
    data_dict[length]['X_val'] = X_val
    data_dict[length]['y_val'] = y_val
    data_dict[length]['X_test'] = X_test
    data_dict[length]['y_test'] = y_test
    
    # Print shapes and ensure no NaNs
    print(f"  X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"  X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
    print(f"  X_test shape: {X_test.shape}, y_test shape: {y_test.shape}\n")

Processing input length: 24
  X_train shape: (7145, 24), y_train shape: (7145, 6)
  X_val shape: (2007, 24), y_val shape: (2007, 6)
  X_test shape: (1719, 24), y_test shape: (1719, 6)

Processing input length: 48
  X_train shape: (7073, 48), y_train shape: (7073, 6)
  X_val shape: (1911, 48), y_val shape: (1911, 6)
  X_test shape: (1604, 48), y_test shape: (1604, 6)

Processing input length: 72
  X_train shape: (7001, 72), y_train shape: (7001, 6)
  X_val shape: (1815, 72), y_val shape: (1815, 6)
  X_test shape: (1508, 72), y_test shape: (1508, 6)

Processing input length: 96
  X_train shape: (6929, 96), y_train shape: (6929, 6)
  X_val shape: (1719, 96), y_val shape: (1719, 6)
  X_test shape: (1412, 96), y_test shape: (1412, 6)

Processing input length: 120
  X_train shape: (6857, 120), y_train shape: (6857, 6)
  X_val shape: (1623, 120), y_val shape: (1623, 6)
  X_test shape: (1316, 120), y_test shape: (1316, 6)

Processing input length: 144
  X_train shape: (6785, 144), y_train shap

## 6. Build SVR model

In [14]:
def build_svr_model(hyperparams):
    if hyperparams['kernel'] == 'poly':  # Ensure degree is used for 'poly' kernel
        model = MultiOutputRegressor(SVR(C=hyperparams['C'], epsilon=hyperparams['epsilon'], 
                                             kernel=hyperparams['kernel'], gamma=hyperparams['gamma'], 
                                             degree=hyperparams['degree']))
    elif hyperparams['kernel'] == 'linear':  # No need for gamma or degree for 'linear'
        model = MultiOutputRegressor(SVR(C=hyperparams['C'], epsilon=hyperparams['epsilon'], 
                                             kernel=hyperparams['kernel']))
    else: # For RBF, sigmoid, etc., gamma is necessary
        model = MultiOutputRegressor(SVR(C=hyperparams['C'], epsilon=hyperparams['epsilon'], 
                                             kernel=hyperparams['kernel'], gamma=hyperparams['gamma']))
    return model

## 7. Define the hyperparameter grid

In [22]:
from itertools import product

# Define hyperparameter grid with conditions on kernel types
hyperparameter_grid = {               
    'C': [1, 10, 100],              # Regularization parameter
    'epsilon': [0.01, 0.1, 0.2],    # Epsilon parameter
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],  # Kernel coefficient for RBF, poly, and sigmoid
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernels to try
    'degree': [2, 3, 4]             # Degree of the polynomial kernel (only for 'poly')
}

# Generate all combinations, respecting kernel-specific constraints
all_combinations = []

# Iterate over each kernel type and create the combinations accordingly
for kernel in hyperparameter_grid['kernel']:
    if kernel == 'linear':
        # For linear kernel, do not include 'gamma' and 'degree'
        for C, epsilon in product(hyperparameter_grid['C'], hyperparameter_grid['epsilon']):
            all_combinations.append((C, epsilon, None, kernel, None))  # No gamma, no degree
    elif kernel == 'poly':
        # For poly kernel, include 'gamma' and 'degree'
        for C, epsilon, gamma, degree in product(
            hyperparameter_grid['C'], 
            hyperparameter_grid['epsilon'], 
            hyperparameter_grid['gamma'], 
            hyperparameter_grid['degree']
        ):
            all_combinations.append((C, epsilon, gamma, kernel, degree))  # Include gamma and degree
    else:
        # For other kernels (rbf, sigmoid), include 'gamma' but not 'degree'
        for C, epsilon, gamma in product(
            hyperparameter_grid['C'], 
            hyperparameter_grid['epsilon'], 
            hyperparameter_grid['gamma']
        ):
            all_combinations.append((C, epsilon, gamma, kernel, None))  # No degree

# Display the resulting combinations
print(f"Total combinations: {len(all_combinations)}")
print(all_combinations[:10])  # Display the first 10 combinations


Total combinations: 279
[(1, 0.01, None, 'linear', None), (1, 0.1, None, 'linear', None), (1, 0.2, None, 'linear', None), (10, 0.01, None, 'linear', None), (10, 0.1, None, 'linear', None), (10, 0.2, None, 'linear', None), (100, 0.01, None, 'linear', None), (100, 0.1, None, 'linear', None), (100, 0.2, None, 'linear', None), (1, 0.01, 'scale', 'rbf', None)]


## 8. Defining manual grid search

In [23]:
import logging

logging.basicConfig(
    level=logging.INFO,  # Set the logging level to INFO
    format='%(asctime)s - %(message)s',  # Customize the log message format
    handlers=[
        logging.FileHandler('svr_updated.log'),  # Log messages to 'output.log'
        logging.StreamHandler()             # Also output to console/notebook
    ]
)

In [None]:
results = []

# Loop over each input length
for length in input_lengths:
    logging.info(f"Starting grid search for input length: {length}")
    
    X_train = data_dict[length]['X_train']
    y_train = data_dict[length]['y_train']
    X_val = data_dict[length]['X_val']
    y_val = data_dict[length]['y_val']
    
    best_mse = float('inf')
    best_params = {}
    best_model = None
    
    # Iterate through all combinations of hyperparameters
    for idx, combination in enumerate(all_combinations):
        hyperparams = {
            'C': combination[0],
            'epsilon': combination[1],
            'gamma': combination[2],
            'kernel': combination[3],
            'degree': combination[4]
        }
        
        logging.info(f"  Evaluating combination {idx + 1}/{len(all_combinations)}: {hyperparams}")

        # build the svr model
        model = build_svr_model(hyperparams)
        # train the model
        model.fit(X_train, y_train)
        # Validate the model on the validation set
        y_val_pred = model.predict(X_val)
        
        # Calculate MSE on the validation set
        current_best_mse = mean_squared_error(y_val, y_val_pred)
        logging.info(f"Validation MSE: {current_best_mse:.5f}")
        
        # If this combination gives a better validation MSE, update the best model
        if current_best_mse < best_mse:
            best_mse = current_best_mse
            best_params = hyperparams.copy()
            best_model = model
    
    # Store the best results
    results.append({
        'Input_Length': length,
        'Best_MSE': best_mse,
        'Best_Hyperparameters': best_params
    })
    
    logging.info(f"Completed grid search for input length: {length}")
    logging.info(f"  Best Validation MSE: {best_mse:.5f}")
    logging.info(f"  Best Hyperparameters: {best_params}\n")

2025-01-15 07:59:50,167 - Starting grid search for input length: 24
2025-01-15 07:59:50,170 -   Evaluating combination 1/279: {'C': 1, 'epsilon': 0.01, 'gamma': None, 'kernel': 'linear', 'degree': None}
2025-01-15 08:00:05,301 - Validation MSE: 0.00502
2025-01-15 08:00:05,302 -   Evaluating combination 2/279: {'C': 1, 'epsilon': 0.1, 'gamma': None, 'kernel': 'linear', 'degree': None}
2025-01-15 08:00:08,106 - Validation MSE: 0.00546
2025-01-15 08:00:08,108 -   Evaluating combination 3/279: {'C': 1, 'epsilon': 0.2, 'gamma': None, 'kernel': 'linear', 'degree': None}
2025-01-15 08:00:08,991 - Validation MSE: 0.00996
2025-01-15 08:00:08,993 -   Evaluating combination 4/279: {'C': 10, 'epsilon': 0.01, 'gamma': None, 'kernel': 'linear', 'degree': None}
2025-01-15 08:00:57,896 - Validation MSE: 0.00502
2025-01-15 08:00:57,897 -   Evaluating combination 5/279: {'C': 10, 'epsilon': 0.1, 'gamma': None, 'kernel': 'linear', 'degree': None}
2025-01-15 08:01:05,912 - Validation MSE: 0.00546
2025-01-

## 9. Storing Results

In [None]:
# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

# Expand the hyperparameters dictionary into separate columns for clarity
hyperparams_df = results_df['Best_Hyperparameters'].apply(pd.Series)

# Combine the main dataframe with hyperparameters
final_results_df = pd.concat([results_df.drop('Best_Hyperparameters', axis=1), hyperparams_df], axis=1)

# Display the final dataframe
print("Final Results DataFrame:")
final_results_df