# GP surrogate model

This notebook creates a surrogate model of EPANET's water quality solver using Gaussian Process (GP) regression.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.colors
default_colors = plotly.colors.qualitative.Plotly
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ConstantKernel as C
from bayesian_wq_calibration.epanet import build_model, sensor_model_id, epanet_simulator, set_reaction_parameters
from bayesian_wq_calibration.calibration import decision_variables_to_dict, generate_samples
from bayesian_wq_calibration.constants import TIMESERIES_DIR, RESULTS_DIR
from bayesian_wq_calibration.data import bulk_temp_adjust

### Load data

Load operational data for selected sensing period.

In [None]:
data_period = 20 # 20 calibration events (as at 30 October 2024)
wq_sensors_used = 'kiosk only' # 'kiosk only', 'kiosk + hydrant'
demand_resolution = 'wwmd' # 'dma', 'wwmd'

# processed files are prefixed with the zero-padded data period (e.g. 3 -> "03")
period_prefix = str(data_period).zfill(2)
try:
    flow_df = pd.read_csv(TIMESERIES_DIR / f"processed/{period_prefix}-flow.csv")
    pressure_df = pd.read_csv(TIMESERIES_DIR / f"processed/{period_prefix}-pressure.csv")
    wq_df = pd.read_csv(TIMESERIES_DIR / f"processed/{period_prefix}-wq.csv", low_memory=False)
except FileNotFoundError as err:
    # Stop here: printing and continuing would either hit a NameError in the next
    # cell or silently reuse dataframes left in the kernel from a previous run.
    raise FileNotFoundError(f"Data period {data_period} does not exist.") from err

# chlorine observations only (wq_df also holds other data types, e.g. temperature)
cl_df = wq_df[wq_df['data_type'] == 'chlorine']

Surrogate model data period.

In [None]:
surrogate_days = 2

# unique time stamps of the flow record, extracted once and reused below
flow_datetimes = flow_df['datetime'].unique()

# full record
n_total = len(flow_datetimes)
total_range = range(n_total)
total_datetime = flow_datetimes[list(total_range)]

# surrogate window: the first `surrogate_days` days of the record
# (24 * 4 steps per day -- presumably 15-minute data; TODO confirm)
n_surrogate = surrogate_days * 24 * 4
surrogate_range = range(n_surrogate)
surrogate_datetime = flow_datetimes[list(surrogate_range)]

Bulk decay.

In [None]:
bulk_coeff_bottle = -0.85 # day^-1 (from bottle tests)

# mean of the 'mean' column over all temperature records in the water quality data
is_temperature = wq_df['data_type'] == 'temperature'
field_temp = wq_df.loc[is_temperature, 'mean'].mean()

# bulk coefficient adjusted from the bottle-test value using the field temperature
bulk_coeff = bulk_temp_adjust(bulk_coeff_bottle, field_temp)

Wall decay grouping.

In [None]:
# see notebook `02-pipe-grouping-exploration.ipynb` for details on pipe groups
grouping = 'material-age-velocity'

# load ga results to get param_mean data
ga_results_df = pd.read_excel(RESULTS_DIR / 'wq/ga_calibration.xlsx', sheet_name=grouping)
ga_results_df = ga_results_df[
    (ga_results_df['data_period'] == data_period)
    & (ga_results_df['demand_resolution'] == demand_resolution)
    & (ga_results_df['wq_sensors_used'] == wq_sensors_used)
]
if ga_results_df.empty:
    # without this check, `.values[0]` below fails with an uninformative IndexError
    raise ValueError(
        f"No GA calibration results for grouping='{grouping}', data_period={data_period}, "
        f"demand_resolution='{demand_resolution}', wq_sensors_used='{wq_sensors_used}'."
    )
# calibrated wall coefficients are stored in the columns whose name starts with 'G';
# take the first matching row
ga_results = ga_results_df[[col for col in ga_results_df.columns if col.startswith('G')]].values[0]

# parameter groups, bounds and mean values for each supported grouping
grouping_data = {
    'single': {
        'param_group': ['G0'],
        'param_bounds': [(-0.5, 0.0)],
        'param_mean': ga_results
    },
    'material-only': {
        # NOTE(review): this entry previously listed two groups against three bounds;
        # a third group was added to match the three bounds -- confirm against the
        # 'material-only' sheet of ga_calibration.xlsx
        'param_group': ['G0', 'G1', 'G2'],
        'param_bounds': [(-1.0, -0.01), (-0.5, -0.01), (-0.15, -0.01)],
        'param_mean': ga_results
    },
    'material-age-diameter': {
        'param_group': ['G0', 'G1', 'G2', 'G3', 'G4', 'G5'],
        'param_bounds': [(-1.0, -0.01), (-1.0, -0.01), (-0.5, -0.01), (-0.5, -0.01), (-0.15, -0.01), (-0.15, -0.01)],
        'param_mean': ga_results
    },
    'material-age-velocity': {
        'param_group': ['G0', 'G1', 'G2', 'G3', 'G4', 'G5'],
        'param_bounds': [(-1.0, -0.01), (-1.0, -0.01), (-0.5, -0.01), (-0.5, -0.01), (-0.15, -0.01), (-0.15, -0.01)],
        'param_mean': ga_results
    }
}

# extract parameter data
param_data = grouping_data[grouping]
param_group = param_data['param_group']
param_bounds = param_data['param_bounds']
param_mean = param_data['param_mean']

# every parameter group needs exactly one (lower, upper) bound
if len(param_group) != len(param_bounds):
    raise ValueError(
        f"Grouping '{grouping}' defines {len(param_group)} parameter groups "
        f"but {len(param_bounds)} bounds."
    )

### Surrogate model

**EPANET simulator**

Build water model using `wntr`.

In [None]:
# restrict flow, pressure and chlorine records to the surrogate time window
surrogate_inputs = [
    df[df['datetime'].isin(surrogate_datetime)]
    for df in (flow_df, pressure_df, cl_df)
]

# chlorine water quality model for the surrogate window
wn = build_model(
    *surrogate_inputs,
    sim_type='chlorine',
    demand_resolution=demand_resolution,
    bulk_coeff=bulk_coeff,
)

Get mean velocities (for 'material-velocity' grouping).

Define simulator function.

In [None]:
def simulator(cl_df, params, wn, grouping):
    """Simulate chlorine for one set of wall decay parameters.

    Parameters
    ----------
    cl_df : chlorine data frame forwarded to `epanet_simulator`.
    params : decision variables, converted to wall coefficients with
        `decision_variables_to_dict(grouping, params)`.
    wn : water network model handed to `set_reaction_parameters`
        (whether it is modified in place is not visible from here).
    grouping : name of the pipe grouping the parameters refer to.

    Returns
    -------
    DataFrame with one row per water quality sensor (labelled by `bwfl_id`,
    with 'BW1' and 'BW4' dropped) and one column per simulated time step,
    named 't_1', 't_2', ...

    Notes
    -----
    Reads the notebook-level `bulk_coeff` for the bulk reaction coefficient.
    """
    wall_coeffs = decision_variables_to_dict(grouping, params)
    wn_with_reactions = set_reaction_parameters(wn, grouping, wall_coeffs, bulk_coeff)

    results = epanet_simulator(wn_with_reactions, 'chlorine', cl_df)
    simulated_cl = results.chlorine

    # keep the water quality sensor nodes only and label them by their BWFL id
    wq_sensors = sensor_model_id('wq')
    model_to_bwfl = wq_sensors.set_index('model_id')['bwfl_id'].to_dict()
    sensor_cl = (
        simulated_cl[wq_sensors['model_id'].unique()]
        .rename(columns=model_to_bwfl)
        .T
    )

    # after the transpose: rows are sensors, columns are time steps (1-based labels)
    sensor_cl.columns = [f't_{step}' for step in range(1, sensor_cl.shape[1] + 1)]

    # remove inlet sensors
    return sensor_cl.drop(index=['BW1', 'BW4'], errors='ignore')

In [None]:
# baseline simulation at the GA-calibrated parameter means, over the surrogate window
in_surrogate_window = cl_df['datetime'].isin(surrogate_datetime)
cl_simulator = simulator(cl_df[in_surrogate_window], param_mean, wn, grouping)

# sensor labels, in the row order returned by the simulator
sensor_names = cl_simulator.index.to_numpy()

**Design of experiments**

Call `generate_samples` function, which takes the following inputs:
- `sampling_method` (default = latin hypercube sampling)
- `dist_type` (default = truncated normal)
- `rel_uncertainty` (default = 50% of parameter mean)

In [None]:
n_samples = [10, 25, 50, 100, 200]
n_samples_idx = 3

# sampled parameter sets (one row per experiment)
X = generate_samples(param_mean, param_bounds, param_group, n_samples[n_samples_idx], sampling_method='lhs', dist_type='truncated normal', rel_uncertainty=0.75, plot=True)

# the chlorine data for the surrogate window is the same for every experiment,
# so filter it once instead of once per sample
# (assumes `simulator` does not modify its `cl_df` argument -- TODO confirm)
surrogate_cl_df = cl_df[cl_df['datetime'].isin(surrogate_datetime)]

# simulator outputs stacked as (experiment, sensor, time step)
Y = np.array([
    simulator(surrogate_cl_df, params, wn, grouping)
    for params in X
])

# the GP cells below index Y as Y[:, sensor, :]; fail early if stacking did not give a 3-D array
assert Y.ndim == 3 and Y.shape[0] == len(X), f"unexpected simulator output shape: {Y.shape}"

**Gaussian process model training**

Training procedure using five-fold cross-validation and `scikit-learn` modules. The following kernels can be used:
- Radial basis function (RBF)
- Matern
- Rational quadratic

Note: a separate GP is trained for each of the **7** sensors.

In [None]:
sensor = 'BW2' # 'BW2', 'BW3', 'BW5', 'BW6', 'BW7', 'BW9', 'BW12'

# position of the selected sensor along the second axis of Y
sensor_positions = np.where(sensor_names == sensor)[0]
s = sensor_positions[0]

# simulator outputs for this sensor only: one row per experiment, one column per time step
Y_s = Y[:, s, :].reshape(len(Y), Y.shape[2])

In [None]:
# train/validate and test datasets
# the first 80% of the experiments are used for training/validation; the rest is held out for testing
train_fraction = 0.8
n_0 = round(train_fraction * len(X))
X_0, Y_s_0 = X[:n_0, :], Y_s[:n_0, :]

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ConstantKernel as C

def GPRegressor(kernel_type='RBF', nu=1.5, n_restarts=50, normalize_y=True, random_state=None):
    """Build an (unfitted) Gaussian process regressor with a scaled stationary kernel.

    Parameters
    ----------
    kernel_type : {'RBF', 'Matern', 'RationalQuadratic'}
        Kernel multiplied by a constant (variance) kernel.
    nu : float
        Smoothness parameter of the Matern kernel; ignored by the other kernels.
    n_restarts : int
        Number of restarts of the hyperparameter optimiser.
    normalize_y : bool
        Passed through to `GaussianProcessRegressor`.
    random_state : int, RandomState instance or None
        Passed through to `GaussianProcessRegressor`; set it for reproducible
        optimiser restarts.

    Returns
    -------
    GaussianProcessRegressor

    Raises
    ------
    ValueError
        If `kernel_type` is not one of the supported kernels.
    """
    if kernel_type not in ('RBF', 'Matern', 'RationalQuadratic'):
        raise ValueError(f"Unknown kernel type: {kernel_type}")

    length_scale_bounds = (1e-1, 1e10)
    if kernel_type == 'RBF':
        base_kernel = RBF(length_scale=1.0, length_scale_bounds=length_scale_bounds)
    elif kernel_type == 'Matern':
        # use the caller's `nu` (it was previously hard-coded to 0.5 and the argument ignored)
        base_kernel = Matern(length_scale=1.0, length_scale_bounds=length_scale_bounds, nu=nu)
    else:
        base_kernel = RationalQuadratic(length_scale=1.0, alpha=1.0, length_scale_bounds=length_scale_bounds)

    # constant kernel first, so fitted parameters are exposed as
    # 'k1__constant_value' (variance) and 'k2__length_scale'
    kernel = C(1.0, (1e-1, 1e5)) * base_kernel

    gp = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=n_restarts,
        normalize_y=normalize_y,
        random_state=random_state,
    )

    return gp

Cross-validation.

In [None]:
# setup parameters
k_folds = 5
cv_seed = 42 # fixed seed: makes the fold assignment and optimiser restarts repeatable across runs
kf = KFold(n_splits=k_folds, shuffle=True, random_state=cv_seed)
kernel = 'RBF' # 'RBF', 'Matern'
gp = GPRegressor(kernel_type=kernel, nu=0.5, n_restarts=50, normalize_y=True)
gp.set_params(random_state=cv_seed)

# cross-validation loop
hyperparameter_performance = []
for fold, (train_idx, validate_idx) in enumerate(kf.split(X_0)):

    X_train, X_validate = X_0[train_idx], X_0[validate_idx]
    Y_train, Y_validate = Y_s_0[train_idx], Y_s_0[validate_idx]

    # refit on this fold's training split and predict the held-out split
    gp.fit(X_train, Y_train)
    Y_pred = gp.predict(X_validate)

    # validation errors over all held-out experiments and time steps
    rmse = np.sqrt(mean_squared_error(Y_validate, Y_pred))
    mae = mean_absolute_error(Y_validate, Y_pred)
    maxae = np.max(np.abs(Y_validate - Y_pred))

    # fitted hyperparameters of the product kernel (k1 = constant kernel, k2 = RBF/Matern)
    fitted_params = gp.kernel_.get_params()
    hyperparameter_performance.append({
        "fold": fold + 1,
        "rmse": rmse,
        "mae": mae,
        "maxae": maxae,
        "length_scale": fitted_params['k2__length_scale'],
        "variance": fitted_params['k1__constant_value']
    })

    print(f"Fold {fold + 1} - RMSE: {rmse:.4f}, MAE: {mae:.4f}, MaxAE: {maxae:.4f}")

Re-train GP model with the entire training/validation dataset (the held-out test samples are excluded).

In [None]:
# final fit on the full training/validation split
gp.fit(X_0, Y_s_0)

# optimised kernel hyperparameters
fitted_kernel_params = gp.kernel_.get_params()
print(fitted_kernel_params)

Test model performance.

In [None]:
# held-out test experiments (everything after the first n_0 samples)
X_1, Y_s_1 = X[n_0:], Y_s[n_0:, :]

# surrogate mean prediction and predictive standard deviation
Y_pred, sigma = gp.predict(X_1, return_std=True)

# mean +/- 1.96 standard deviations
interval_half_width = 1.96 * sigma
Y_upper = Y_pred + interval_half_width
Y_lower = Y_pred - interval_half_width

Test results plotting.

In [None]:
# simulator minus surrogate on the test set; shared by the metrics and plots below
residuals = Y_s_1 - Y_pred

# performance metrics
rmse = np.sqrt(np.mean(residuals ** 2))
print(f"Root mean squared error: {rmse}")
mae = np.mean(np.abs(residuals))
print(f"Mean absolute error: {mae}")
maxae = np.max(np.abs(residuals))
print(f"Maximum absolute error: {maxae}")
r2 = gp.score(X_1, Y_s_1)
print(f"Kernel: {kernel}, r^2 Score: {r2}")

# parity plot of surrogate v. simulator
parity_trace = go.Scatter(
    x=Y_s_1.flatten(),
    y=Y_pred.flatten(),
    mode='markers',
    marker=dict(size=6, opacity=0.6),
)
fig = go.Figure(data=parity_trace)
fig.update_layout(
    template="simple_white",
    xaxis_title="Simulator [mg/L]",
    yaxis_title="Surrogate [mg/L]",
    width=550,
    height=450
)
fig.show()

# histogram plot of errors
errors = residuals.flatten()
error_histogram = go.Histogram(x=errors, nbinsx=40)
fig = go.Figure(data=[error_histogram])
fig.update_layout(
    template="simple_white",
    xaxis_title="Error [mg/L]",
    yaxis_title=f"Frequency (n={len(errors)})",
    width=600,
    height=400
)
fig.show()

# cdf plot of absolute errors
absolute_errors = np.abs(residuals).flatten()
sorted_errors = np.sort(absolute_errors)
# empirical CDF: the i-th smallest error sits at cumulative probability i / n
cdf_values = np.arange(1, len(sorted_errors) + 1) / len(sorted_errors)
fig = go.Figure()
fig.add_trace(go.Scatter(x=sorted_errors, y=cdf_values, mode='lines'))
fig.update_layout(
    template="simple_white",
    xaxis_title="Absolute Error [mg/L]",
    yaxis_title="Cumulative distribution",
    width=600,
    height=400
)
fig.show()

In [None]:
fig = go.Figure()

actual_data = Y_s_1
predicted_data = Y_pred

# one colour per test experiment: solid line = simulator, dashed line = surrogate
for exp_idx in range(actual_data.shape[0]):
    color = default_colors[exp_idx % len(default_colors)]
    shared_trace_args = dict(x=surrogate_datetime, mode='lines', showlegend=True)

    simulator_trace = go.Scatter(
        y=actual_data[exp_idx, :],
        name=f"Simulator (Exp {exp_idx + 1})",
        line=dict(color=color, dash='solid'),
        **shared_trace_args,
    )
    surrogate_trace = go.Scatter(
        y=predicted_data[exp_idx, :],
        name=f"Surrogate (Exp {exp_idx + 1})",
        line=dict(color=color, dash='dash'),
        **shared_trace_args,
    )
    fig.add_trace(simulator_trace)
    fig.add_trace(surrogate_trace)

fig.update_yaxes(title_text="Chlorine [mg/L]", rangemode="tozero")
fig.update_layout(
    title=f"GP model validation sensor {sensor}",
    template='simple_white',
    legend_title_text='',
    height=600,  # single plot, so a fixed height is sufficient
)
fig.show()