# Pipe group classification for wall decay coefficients

This notebook develops a systematic approach for grouping pipes to assign wall decay coefficients in the Bristol Water Field Lab's water quality model. The classification incorporates the following.

Physical parameters:
- pipe material, age, and diameter

Hydraulic conditions:
- mean flow velocity
- self-cleaning capacity (SCC) threshold

The resulting pipe groups inform both deterministic (genetic algorithm) and probabilistic (Bayesian inference) calibration of wall decay coefficients. The analysis concludes by mapping these groups against the network's sensor-observable paths to evaluate monitoring coverage.

In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.colors
from plotly.subplots import make_subplots
import plotly.io as pio
default_colors = plotly.colors.qualitative.Plotly
from datetime import datetime
from bayesian_wq_calibration.data import load_network_data
from bayesian_wq_calibration.plotting import plot_network_features
from bayesian_wq_calibration.constants import NETWORK_DIR, INP_FILE, TIMESERIES_DIR, RESULTS_DIR
from bayesian_wq_calibration.epanet import build_model, epanet_simulator, sensor_model_id
from bayesian_wq_calibration.calibration import get_observable_paths

pio.renderers.default = "notebook+pdf"

### Load network data

Get EPANET and GIS data.

In [2]:
# network model loaded from the EPANET input file; keep handles to its link/node tables and metadata
wdn = load_network_data(NETWORK_DIR / INP_FILE)
link_df, node_df, net_info = wdn.link_df, wdn.node_df, wdn.net_info

# GIS attributes (the `model_id`, `material` and `age` columns are merged into the link table below)
gis_df = pd.read_excel(NETWORK_DIR / 'gis_data.xlsx')

Run EPANET simulation to get hydraulic data.

In [3]:
data_period = 16 # 20 calibration events (as at 30 October 2024)
period_tag = str(data_period).zfill(2)  # zero-padded prefix of the processed files

# Load the processed time series for the selected period. Only a missing file is
# reported as "does not exist"; any other error propagates unchanged instead of
# being swallowed and resurfacing further down as a NameError.
try:
    flow_df = pd.read_csv(TIMESERIES_DIR / f"processed/{period_tag}-flow.csv")
    pressure_df = pd.read_csv(TIMESERIES_DIR / f"processed/{period_tag}-pressure.csv")
    wq_df = pd.read_csv(TIMESERIES_DIR / f"processed/{period_tag}-wq.csv", low_memory=False)
except FileNotFoundError as err:
    raise FileNotFoundError(f"Data period {data_period} does not exist.") from err

# chlorine records only (kept outside the try so a schema problem is not misreported as a missing period)
cl_df = wq_df[wq_df['data_type'] == 'chlorine']

# build wntr model
demand_resolution = 'wwmd'
wn = build_model(flow_df, pressure_df, cl_df, sim_type='hydraulic', demand_resolution=demand_resolution)

# run epanet simulator
sim = epanet_simulator(wn, 'velocity', cl_df)
# transposed velocities: the next cell aggregates along axis=1 and uses the index as `model_id`
vel = sim.velocity.T

In [None]:
# velocity features per row of `vel` (aggregated along axis=1)
scc_thresh = 0.2  # self-cleaning threshold on peak velocity (presumably m/s — TODO confirm units)
vel_mean = vel.mean(axis=1)
vel_max = vel.max(axis=1)
# stored as the strings 'True' / 'False' so it can be used as a categorical label
vel_scc = (vel_max > scc_thresh).astype(str)

# flat table keyed by `model_id` (taken from the index of the aggregated velocities)
vel_columns = {
    'model_id': vel_mean.index,
    'vel_mean': vel_mean.to_numpy(),
    'vel_max': vel_max.to_numpy(),
    'vel_scc': vel_scc.to_numpy(),
}
vel_df = pd.DataFrame(vel_columns)

Merge physical and operational pipe properties into a single dataframe.

In [None]:
# start from the link table, excluding valves
feature_df = link_df.loc[link_df['link_type'] != 'valve'].copy()

# merge dataframes: attach GIS material/age by link ID, then expose the link ID as `model_id`
gis_attributes = gis_df[['model_id', 'material', 'age']]
feature_df = (
    feature_df
    .merge(gis_attributes, left_on='link_ID', right_on='model_id', how='left')
    .drop(columns=['model_id'])
    .rename(columns={'link_ID': 'model_id'})
)
# move `model_id` to the first column, keeping the remaining column order
other_columns = [col for col in feature_df.columns if col != 'model_id']
feature_df = feature_df[['model_id'] + other_columns]
feature_df = feature_df.merge(vel_df, on='model_id', how='left')

# compute additional features
# residence time in hours = length / mean velocity / 3600; zero velocity gives NaN rather than inf
nonzero_vel_mean = feature_df['vel_mean'].where(feature_df['vel_mean'] != 0)
residence_h = feature_df['length'] / nonzero_vel_mean / 3600
# NOTE(review): values above 24 h AND missing values are both replaced by 0 here — confirm this is intended
feature_df['mean_residence_time_h'] = residence_h.where(residence_h <= 24, 0)

# `age` arrives as a year-like value (presumably installation year — TODO confirm); entries below 1800 count as missing
install_year = feature_df['age'].where(feature_df['age'] >= 1800, np.nan)
# convert to age in years relative to the current calendar year; missing ages default to 50
pipe_age = datetime.now().year - install_year
feature_df['age'] = pipe_age.fillna(50).astype(int)

# each pipe's share of the total (non-valve) link length; sums to 1
feature_df['length_normalized'] = feature_df['length'] / feature_df['length'].sum()

feature_df

### Explore pipe features

Material grouping.

In [None]:
# material classes used for grouping
M1 = ['CI', 'SI', 'Pb', 'DI', 'ST']
M2 = ['AC']
M3 = ['HPPE', 'HPPE+FOIL', 'LDPE', 'MDPE', 'MDPE+FOIL', 'PE100+Skin', 'PVC', 'Unknown']

# lookup from raw material code to class label; materials not listed in any class map to NaN
material_to_group = {
    **dict.fromkeys(M1, 'M1'),
    **dict.fromkeys(M2, 'M2'),
    **dict.fromkeys(M3, 'M3'),
}
feature_df['material_group'] = feature_df['material'].map(material_to_group)

# plotting
feature = 'material_group'
feature_order = ['M1', 'M2', 'M3']
# reindex (rather than `[...]`) so a class with no pipes is drawn as a zero bar instead of raising KeyError
feature_by_count = feature_df[feature].value_counts().reindex(feature_order, fill_value=0)
feature_by_length = feature_df.groupby(feature)['length_normalized'].sum().reindex(feature_order, fill_value=0)

# NOTE(review): this figure handle is replaced by the histogram figure below; confirm that
# plot_network_features renders the map itself, otherwise it is never displayed
fig = plot_network_features(feature_df, feature=feature)

fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.15)
fig.add_trace(
    go.Bar(x=feature_order, y=feature_by_count.values, marker=dict(color=default_colors[0]), name='pipe count', width=0.5),
    row=1, col=1,
)
# `length_normalized` is a fraction of total length; scale to percent to match the axis label
fig.add_trace(
    go.Bar(x=feature_order, y=100 * feature_by_length.values, marker=dict(color=default_colors[1]), name="percent of network", width=0.5),
    row=1, col=2,
)
fig.update_layout(
    title=f'histograms for feature: {feature}',
    xaxis_title_text='',
    xaxis2_title_text='',
    yaxis_title_text='pipe count',
    yaxis2_title_text='pipe length [%]',
    template="simple_white",
    width=1200,
    height=450
)
fig.show()

Age grouping.

In [None]:
# age class boundaries in years: A1 <= 20 < A2 <= 70 < A3
age_thresh = [20, 70]
age_bins = [-np.inf, *age_thresh, np.inf]
# right-closed bins reproduce the `<=` / `>` comparisons; cast back to plain strings (NaN stays NaN)
feature_df['age_group'] = pd.cut(feature_df['age'], bins=age_bins, labels=['A1', 'A2', 'A3']).astype(object)

# plotting
feature = 'age_group'
feature_order = ['A1', 'A2', 'A3']
# reindex (rather than `[...]`) so a class with no pipes is drawn as a zero bar instead of raising KeyError
feature_by_count = feature_df[feature].value_counts().reindex(feature_order, fill_value=0)
feature_by_length = feature_df.groupby(feature)['length_normalized'].sum().reindex(feature_order, fill_value=0)

# NOTE(review): this figure handle is replaced by the histogram figure below; confirm that
# plot_network_features renders the map itself, otherwise it is never displayed
fig = plot_network_features(feature_df, feature=feature)

fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.15)
fig.add_trace(
    go.Bar(x=feature_order, y=feature_by_count.values, marker=dict(color=default_colors[0]), name='pipe count', width=0.5),
    row=1, col=1,
)
# `length_normalized` is a fraction of total length; scale to percent to match the axis label
fig.add_trace(
    go.Bar(x=feature_order, y=100 * feature_by_length.values, marker=dict(color=default_colors[1]), name="percent of network", width=0.5),
    row=1, col=2,
)
fig.update_layout(
    title=f'histograms for feature: {feature}',
    xaxis_title_text='',
    xaxis2_title_text='',
    yaxis_title_text='pipe count',
    yaxis2_title_text='pipe length [%]',
    template="simple_white",
    width=1200,
    height=450
)
fig.show()

Diameter grouping.

In [None]:
# diameter class boundary: D1 <= threshold < D2 (same units as the `diameter` column — presumably metres, TODO confirm)
diameter_thresh = [0.1]
diameter_bins = [-np.inf, *diameter_thresh, np.inf]
# right-closed bins reproduce the `<=` / `>` comparisons; cast back to plain strings (NaN stays NaN)
feature_df['diameter_group'] = pd.cut(feature_df['diameter'], bins=diameter_bins, labels=['D1', 'D2']).astype(object)

# plotting
feature = 'diameter_group'
feature_order = ['D1', 'D2']
# reindex (rather than `[...]`) so a class with no pipes is drawn as a zero bar instead of raising KeyError
feature_by_count = feature_df[feature].value_counts().reindex(feature_order, fill_value=0)
feature_by_length = feature_df.groupby(feature)['length_normalized'].sum().reindex(feature_order, fill_value=0)

# NOTE(review): this figure handle is replaced by the histogram figure below; confirm that
# plot_network_features renders the map itself, otherwise it is never displayed
fig = plot_network_features(feature_df, feature=feature)

fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.15)
fig.add_trace(
    go.Bar(x=feature_order, y=feature_by_count.values, marker=dict(color=default_colors[0]), name='pipe count', width=0.4),
    row=1, col=1,
)
# `length_normalized` is a fraction of total length; scale to percent to match the axis label
fig.add_trace(
    go.Bar(x=feature_order, y=100 * feature_by_length.values, marker=dict(color=default_colors[1]), name="pipe length", width=0.4),
    row=1, col=2,
)
fig.update_layout(
    title=f'histograms for feature: {feature}',
    xaxis_title_text='',
    xaxis2_title_text='',
    yaxis_title_text='pipe count',
    yaxis2_title_text='pipe length [%]',
    template="simple_white",
    width=1200,
    height=450
)
fig.show()

Mean velocity grouping.

In [None]:
# split point between the low (V1) and high (V2) mean-velocity classes.
# This is the 55th percentile of `vel_mean`, not the median, hence the neutral name.
vel_mean_thresh = feature_df['vel_mean'].quantile(0.55)

# pipes with a missing mean velocity fail both comparisons and are left as NaN
feature_df['vel_mean_group'] = [
    'V1' if link_vel <= vel_mean_thresh else
    'V2' if link_vel > vel_mean_thresh else
    np.nan
    for link_vel in feature_df['vel_mean']
]

feature = 'vel_mean_group'
feature_order = ['V1', 'V2']
# reindex (rather than `[...]`) so a class with no pipes is drawn as a zero bar instead of raising KeyError
feature_by_count = feature_df[feature].value_counts().reindex(feature_order, fill_value=0)
feature_by_length = feature_df.groupby(feature)['length_normalized'].sum().reindex(feature_order, fill_value=0)

# NOTE(review): this figure handle is replaced by the histogram figure below; confirm that
# plot_network_features renders the map itself, otherwise it is never displayed
fig = plot_network_features(feature_df, feature=feature)

fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.15)
fig.add_trace(
    go.Bar(x=feature_order, y=feature_by_count.values, marker=dict(color=default_colors[0]), name='pipe count', width=0.5),
    row=1, col=1,
)
# `length_normalized` is a fraction of total length; scale to percent to match the axis label
fig.add_trace(
    go.Bar(x=feature_order, y=100 * feature_by_length.values, marker=dict(color=default_colors[1]), name="percent of network", width=0.5),
    row=1, col=2,
)
fig.update_layout(
    title=f'histograms for feature: {feature}',
    xaxis_title_text='',
    xaxis2_title_text='',
    yaxis_title_text='pipe count',
    yaxis2_title_text='pipe length [%]',
    template="simple_white",
    width=1200,
    height=450
)
fig.show()

Self-cleaning capacity feature.

In [None]:
# `vel_scc` holds the strings 'True' / 'False' (peak velocity above the SCC threshold)
feature = 'vel_scc'
feature_by_count = feature_df[feature].value_counts()
# bars are ordered by descending pipe count; the length series is aligned to that same order
feature_order = feature_by_count.index
feature_by_length = feature_df.groupby(feature)['length_normalized'].sum().reindex(feature_order)

# NOTE(review): this figure handle is replaced by the histogram figure below; confirm that
# plot_network_features renders the map itself, otherwise it is never displayed
fig = plot_network_features(feature_df, feature=feature)

fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.15)
fig.add_trace(
    go.Bar(x=feature_order, y=feature_by_count.values, marker=dict(color=default_colors[0]), name='pipe count', width=0.5),
    row=1, col=1,
)
# `length_normalized` is a fraction of total length; scale to percent to match the axis label
fig.add_trace(
    go.Bar(x=feature_order, y=100 * feature_by_length.values, marker=dict(color=default_colors[1]), name="percent of network", width=0.5),
    row=1, col=2,
)
fig.update_layout(
    title=f'histograms for feature: {feature}',
    xaxis_title_text='',
    xaxis2_title_text='',
    yaxis_title_text='pipe count',
    yaxis2_title_text='percent of network [%]',
    template="simple_white",
    width=1200,
    height=450
)
fig.show()

### Select pipe grouping for calibration

Three pipe grouping options:
1. material
2. material --> age
3. material --> age --> mean velocity

In [None]:
# option 1: material
group_id = ['G1', 'G2']
material = [['M1'], ['M2', 'M3']]
group_1_df = pd.DataFrame({
    'group_id': group_id,
    'material': material
})

group_id = ['G1', 'G2', 'G3']
material = [['M1'], ['M1'], ['M2', 'M3']]


# option 2: material --> age
age = [['A3'], ['A1', 'A2'], ['A1', 'A2', 'A3']]
group_2_df = pd.DataFrame({
    'group_id': group_id,
    'material': material,
    'age': age
})

group_id = ['G1', 'G2', 'G3', 'G4', 'G5']
material = [['M1'], ['M1'], ['M1'], ['M1'], ['M2', 'M3']]
age = [['A3'], ['A3'], ['A1', 'A2'], ['A1', 'A2'], ['A1', 'A2', 'A3']]

# option 3: material --> age --> velocity
vel_mean = [['V1'], ['V2'], ['V1'], ['V2'], ['V1', 'V2']]
group_3_df = pd.DataFrame({
    'group_id': group_id,
    'material': material,
    'age': age,
    'vel_mean': vel_mean
})

Assign pipe groups to `feature_df`.

In [None]:
def assign_group(row, group_df, group_option):
    """Return the `group_id` of the first group in `group_df` that accepts `row`.

    Parameters
    ----------
    row : mapping-like
        One pipe; read via `row['material_group']`, and additionally
        `row['age_group']` (options 2 and 3) and `row['vel_mean_group']` (option 3).
    group_df : pandas.DataFrame
        One row per group with a `group_id` column and list-valued `material`
        (and, where present, `age` / `vel_mean`) columns of accepted class labels.
    group_option : int
        1 matches on material only, 2 on material and age, 3 on material, age
        and mean velocity. Any other value never matches.

    Returns
    -------
    The matching `group_id`, or None when no group accepts the pipe.
    """
    for _, candidate in group_df.iterrows():
        # material must match for every option
        if row['material_group'] not in candidate['material']:
            continue
        if group_option == 1:
            return candidate['group_id']

        # options 2 and 3 also require an age match (a missing `age` column matches nothing)
        if row['age_group'] not in candidate.get('age', []):
            continue
        if group_option == 2:
            return candidate['group_id']

        # option 3 finally requires a mean-velocity match
        if group_option == 3 and row['vel_mean_group'] in candidate.get('vel_mean', []):
            return candidate['group_id']
    return None

# output column -> (group table, matching depth passed to assign_group)
# NOTE(review): the first entry overwrites the raw `material` column with group IDs, so the
# material-grouping cell cannot be re-run after this one without rebuilding `feature_df`
group_options = {
    'material': (group_1_df, 1),
    'material-age': (group_2_df, 2),
    'material-age-velocity': (group_3_df, 3),
}
for group_column, (group_table, group_option) in group_options.items():
    feature_df[group_column] = feature_df.apply(assign_group, axis=1, args=(group_table, group_option))

In [None]:
# select feature to plot
feature = 'material-age-velocity'

# show the definition of the selected grouping and take the bar order from its own
# `group_id` column, so the order cannot drift from the group table
group_frames = {
    'material': group_1_df,
    'material-age': group_2_df,
    'material-age-velocity': group_3_df,
}
selected_group_df = group_frames.get(feature)
if selected_group_df is not None:
    display(selected_group_df)
    group_id = selected_group_df['group_id'].tolist()

# per-group pipe count and share of total network length, both ordered by descending count
feature_by_count = feature_df[feature].value_counts()
feature_order = feature_by_count.index
feature_by_length = feature_df.groupby(feature)['length_normalized'].sum().reindex(feature_order)

# network map coloured by group, exported for the report
fig = plot_network_features(feature_df, feature=feature)
fig.write_image("fig6-1a.pdf", engine="kaleido", scale=1)

fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.25)
fig.add_trace(
    go.Bar(x=feature_order, y=feature_by_count.values, marker=dict(color=default_colors[0]), name='pipe count', width=0.5),
    row=1, col=1,
)
# `length_normalized` is a fraction of total length; scale to percent to match the axis label
fig.add_trace(
    go.Bar(x=feature_order, y=100 * feature_by_length.values, marker=dict(color=default_colors[1]), name="percent of network", width=0.5),
    row=1, col=2,
)
# same font sizes on all four axes
axis_font = dict(title_font=dict(size=20), tickfont=dict(size=20))
fig.update_layout(
    title=f'histograms for feature: {feature}',
    xaxis_title_text='',
    xaxis2_title_text='',
    yaxis_title_text='pipe count',
    yaxis2_title_text='percent of network [%]',
    template="simple_white",
    width=1200,
    height=450,
    xaxis=dict(axis_font),
    xaxis2=dict(axis_font),
    yaxis=dict(axis_font),
    yaxis2=dict(axis_font),
)
# draw bars in group order (G1, G2, ...) rather than by count
fig.update_xaxes(categoryorder='array', categoryarray=group_id)
fig.show()

Save the pipe group assignments (a subset of `feature_df` columns) to a CSV file.

In [None]:
# export link identifiers plus the three group assignments
# (at this point `material` holds the option-1 group ID, not the raw material code)
group_columns = ['model_id', 'link_type', 'material', 'material-age', 'material-age-velocity']
group_df = feature_df[group_columns]
group_df.to_csv(RESULTS_DIR / 'wq/pipe_groups.csv', index=False)

### Observable path

Compute flow paths across the simulation period.

In [None]:
# second simulator pass on the same model, this time requesting flows
sim = epanet_simulator(wn, 'flow', cl_df)
# NOTE(review): this rebinds `flow_df`, which earlier held the CSV flow data passed to build_model
flow_df = sim.flow.transpose().iloc[:, 0:96] # only need 24-hour period

Plot pipe group statistics along the observable path, i.e., pipes connected by water quality sensors.

In [None]:
feature = 'material-age-velocity'
wq_sensors_used = 'kiosk + hydrant'

# show the definition of the selected grouping and take the bar order from its own
# `group_id` column, so this cell does not depend on a `group_id` left over from another cell
group_frames = {
    'material': group_1_df,
    'material-age': group_2_df,
    'material-age-velocity': group_3_df,
}
selected_group_df = group_frames.get(feature)
if selected_group_df is not None:
    display(selected_group_df)
    group_id = selected_group_df['group_id'].tolist()

observable_path = get_observable_paths(flow_df, link_df, wq_sensors_used)

# Attach the flag to `feature_df` by link ID. `feature_df` received a fresh row index
# from the merges that built it, so its row labels no longer match `link_df`'s;
# assigning the column directly would align on those labels and could flag the
# wrong pipes (or leave NaN) whenever valves are not the last rows of `link_df`.
link_observable_df = link_df[['link_ID']].copy()
link_observable_df['observable_path'] = observable_path
observable_by_id = link_observable_df.set_index('link_ID')['observable_path']
# `.eq(True)` yields a strict boolean mask; links without a flag count as not observable
feature_df['observable_path'] = feature_df['model_id'].map(observable_by_id).eq(True)
observable_df = feature_df[feature_df['observable_path']]

# per-group pipe count and share of total network length on the observable path,
# both ordered by descending count
feature_by_count = observable_df[feature].value_counts()
feature_order = feature_by_count.index
feature_by_length = observable_df.groupby(feature)['length_normalized'].sum().reindex(feature_order)

# histogram plots
fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.15)
fig.add_trace(
    go.Bar(x=feature_order, y=feature_by_count.values, marker=dict(color=default_colors[0]), name='pipe count', width=0.5),
    row=1, col=1,
)
# `length_normalized` is a fraction of total length; scale to percent to match the axis label
fig.add_trace(
    go.Bar(x=feature_order, y=100 * feature_by_length.values, marker=dict(color=default_colors[1]), name="percent of network", width=0.5),
    row=1, col=2,
)
# same font sizes on all four axes
axis_font = dict(title_font=dict(size=20), tickfont=dict(size=20))
fig.update_layout(
    title=f'histograms for feature: {feature} (observable paths only)',
    xaxis_title_text='',
    xaxis2_title_text='',
    yaxis_title_text='pipe count',
    yaxis2_title_text='percent of network [%]',
    template="simple_white",
    width=1200,
    height=450,
    xaxis=dict(axis_font),
    xaxis2=dict(axis_font),
    yaxis=dict(axis_font),
    yaxis2=dict(axis_font),
)
# draw bars in group order (G1, G2, ...) rather than by count
fig.update_xaxes(categoryorder='array', categoryarray=group_id)
fig.show()

In [None]:
# per-group sum of `length_normalized` over pipes on the observable path (a fraction of total network length)
feature_by_length

In [None]:
# network plot of the selected grouping with the observable-path options switched on, exported for the report
network_plot_options = dict(
    feature=feature,
    observable=True,
    flow_df=flow_df,
    wq_sensors_used=wq_sensors_used,
)
fig = plot_network_features(feature_df, **network_plot_options)
fig.write_image("fig6-1b.pdf", engine="kaleido", scale=1)