# Genetic algorithm model calibration

This notebook applies a genetic algorithm to calibrate wall decay coefficients using data from each water quality sensing period.

In [10]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors
default_colors = plotly.colors.qualitative.Plotly
from bayesian_wq_calibration.simulation import build_model, sensor_model_id
from bayesian_wq_calibration.ga import evaluate
from bayesian_wq_calibration.constants import TIMESERIES_DIR
import random
from deap import base, creator, tools, algorithms
from functools import partial
import multiprocessing

### Load data and build water model

Load operational data for selected sensing period.

In [11]:
data_period = 18 # 19 calibration events (as at 30 September 2024)

# Zero-padded period number used as the filename prefix (e.g. 18 -> "18", 3 -> "03").
period_prefix = str(data_period).zfill(2)

# Only a missing file means "this data period does not exist". Any other failure
# (parse error, interrupted read, ...) propagates unchanged instead of being
# reported with a misleading message. The error is re-raised so later cells
# cannot silently run on frames left over from a previously loaded period.
try:
    flow_df = pd.read_csv(TIMESERIES_DIR / f"processed/{period_prefix}-flow.csv")
    pressure_df = pd.read_csv(TIMESERIES_DIR / f"processed/{period_prefix}-pressure.csv")
    wq_df = pd.read_csv(TIMESERIES_DIR / f"processed/{period_prefix}-wq.csv", low_memory=False)
except FileNotFoundError as err:
    raise FileNotFoundError(f"Data period {data_period} does not exist.") from err

# Keep only the chlorine rows of the water quality table.
cl_df = wq_df[wq_df['data_type'] == 'chlorine']
print(f"Calibrating wall model decay coefficients for data period: {data_period}...")

Calibrating wall model decay coefficients for data period: 18...


Split sensing data into train and test datasets.

In [12]:
# Unique timestamps in order of first appearance; computed once and reused for every split.
unique_datetime = flow_df['datetime'].unique()

n_total = len(unique_datetime)
n_train = 7 * 24 * 4  # presumably 7 days of 15-minute time steps — TODO confirm sampling interval

# Fail with a clear message if the period is shorter than the training window.
if n_train > n_total:
    raise ValueError(
        f"Training window ({n_train} time steps) exceeds available data ({n_total} time steps)."
    )

# First n_train time steps are used for training.
train_range = range(n_train)
train_datetime = unique_datetime[:n_train]

# Remaining time steps are held out for testing.
test_range = range(n_train, n_total)
test_datetime = unique_datetime[n_train:]

# Full period (train + test).
total_range = range(n_total)
total_datetime = unique_datetime

Build water model via WNTR module.

In [13]:
# Restrict the flow, pressure and chlorine tables to the training timestamps,
# then pass them (in that order) to build_model for a chlorine simulation.
train_inputs = [
    df[df['datetime'].isin(train_datetime)]
    for df in (flow_df, pressure_df, cl_df)
]
wn_train = build_model(*train_inputs, sim_type='chlorine', demand_resolution='wwmd')

Set grouping type and initialize wall decay coefficients.

In [14]:
grouping = 'single'

# Range from which initial wall decay coefficients are sampled.
# NOTE(review): the "ub" value (-5.0) is numerically below the "lb" value (0.0);
# the names look inverted. Values are left unchanged here — confirm intent.
wall_coeffs_ub = -5.0
wall_coeffs_lb = 0.0

# Number of wall decay coefficients to calibrate for each supported grouping type.
N_COEFFS_BY_GROUPING = {
    'single': 1,
    'diameter-based': 4,
    'roughness-based': 8,
    'material-based': 3,
}

# Reject unsupported groupings here rather than leaving n_coeffs undefined
# (or stale from a previous run) and failing later in the GA setup.
if grouping not in N_COEFFS_BY_GROUPING:
    raise ValueError(
        f"Unknown grouping '{grouping}'; expected one of {sorted(N_COEFFS_BY_GROUPING)}."
    )
n_coeffs = N_COEFFS_BY_GROUPING[grouping]

### Initialize GA using DEAP module.

Assign default parameters.

In [15]:
# GA hyperparameters.
POPULATION_SIZE = 25      # individuals per generation
GENERATIONS = 50          # number of generations to evolve
P_CROSSOVER = 0.7         # probability of mating two individuals
P_MUTATION = 0.2          # probability of mutating an individual
TOURNAMENT_SIZE = 3       # individuals competing in each selection tournament
HALL_OF_FAME_SIZE = 1     # number of best individuals retained

# Seed the random number generators so a Restart & Run All reproduces the same
# GA run. DEAP draws from Python's `random` module; numpy is seeded as well in
# case any downstream code uses its global generator.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

Set up DEAP structures.

In [16]:
# Create the DEAP classes only once: creator.create adds attributes to the
# `creator` module, so re-running this cell would otherwise re-create (and warn
# about overwriting) classes that already exist.
if not hasattr(creator, "FitnessMin"):
    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))  # minimize the objective function
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()

# Each gene is one wall decay coefficient drawn uniformly between the two bounds;
# an individual is a list of n_coeffs such genes.
toolbox.register("attr_float", random.uniform, wall_coeffs_lb, wall_coeffs_ub)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, n=n_coeffs)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Variation and selection operators.
# NOTE(review): nothing in this cell clips offspring back into the sampling
# range, so Gaussian mutation / blend crossover may produce coefficients outside
# the bounds — confirm whether `evaluate` tolerates that.
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.1, indpb=0.2)
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("select", tools.selTournament, tournsize=TOURNAMENT_SIZE)

# Optional parallel evaluation (currently disabled):
# pool = multiprocessing.Pool()
# toolbox.register("map", pool.map)



### Run GA

In [17]:
def main(wn, cl_df, grouping):
    """Run the genetic algorithm once for the supplied model and chlorine data.

    Registers an ``evaluate`` function on the module-level ``toolbox`` with
    ``wn``, ``cl_df`` and ``grouping`` pre-bound, then evolves a fresh
    population with ``algorithms.eaSimple`` using the notebook's GA constants.

    Args:
        wn: Water network model forwarded to ``evaluate``.
        cl_df: Chlorine sensor data forwarded to ``evaluate``.
        grouping: Coefficient grouping type forwarded to ``evaluate``.

    Returns:
        Tuple ``(pop, logbook, hof)``: the final population, the per-generation
        statistics logbook, and the hall of fame holding the best individuals.
    """
    # Bind the period-specific inputs so DEAP can call evaluate(individual).
    toolbox.register(
        "evaluate",
        partial(evaluate, wn=wn, cl_df=cl_df, grouping=grouping),
    )

    # Per-generation fitness summary reported in the logbook.
    fitness_stats = tools.Statistics(lambda ind: ind.fitness.values)
    for label, reducer in (("avg", np.mean), ("min", np.min), ("max", np.max)):
        fitness_stats.register(label, reducer)

    # Archive of the best individuals seen across all generations.
    best_individuals = tools.HallOfFame(HALL_OF_FAME_SIZE)

    # Random starting population.
    initial_pop = toolbox.population(n=POPULATION_SIZE)

    # Evolve the population for the configured number of generations.
    final_pop, logbook = algorithms.eaSimple(
        initial_pop,
        toolbox,
        cxpb=P_CROSSOVER,
        mutpb=P_MUTATION,
        ngen=GENERATIONS,
        stats=fitness_stats,
        halloffame=best_individuals,
        verbose=True,
    )

    return final_pop, logbook, best_individuals

In [None]:
if __name__ == "__main__":
    # Calibrate against the training portion of the chlorine data only.
    train_cl_df = cl_df[cl_df['datetime'].isin(train_datetime)]
    pop, log, hof = main(wn_train, train_cl_df, grouping)

    # Report the best individual in the hall of fame and its fitness.
    best = hof[0]
    print("Best individual is: %s\nwith fitness: %s" % (best, best.fitness))

gen	nevals	avg      	min     	max      
0  	25    	0.0557124	0.014181	0.0738958
1  	23    	0.0395566	0.0117793	0.0601787
2  	21    	0.0263751	0.0134383	0.0531956
3  	24    	0.017352 	0.0115357	0.0302427
4  	21    	0.0135273	0.0111866	0.0147409
5  	18    	0.0127191	0.0111866	0.013836 
6  	18    	0.0119447	0.0107971	0.015121 
