This notebook uses a vanilla (basic) Bayesian optimization algorithm to tackle an urban travel demand (i.e., origin-destination, OD) calibration problem. The traffic simulations are based on the SUMO simulator.


# SUMO configuration


Mount GDrive


If you are working with Google Colab rather than a JupyterLab notebook, the drive mounting and SUMO installation below need to be repeated every time you restart the runtime.


In [1]:
# from google.colab import drive
# drive.mount('/content/gdrive')

**Setup venv**

In [2]:
# %python -m venv .venv
# %source .venv/bin/activate

Install SUMO


In [3]:
# %sudo add-apt-repository -y ppa:sumo/stable
# %sudo apt-get update
# %sudo apt-get -y install sumo sumo-tools sumo-doc &

Set SUMO environment variables


In [1]:
# Configure the SUMO environment before any SUMO tooling is imported.
import os
import sys

# Respect a SUMO_HOME that is already exported (e.g. a custom or pip-based
# install) and only fall back to the default apt location when it is missing.
sumo_home = os.environ.setdefault('SUMO_HOME', '/usr/share/sumo')
os.environ['LIBSUMO_AS_TRACI'] = '1' #Optional: for a huge performance boost (~8x) with Libsumo (No GUI)

# Make the SUMO python tools importable. The membership test keeps sys.path
# free of duplicates when this cell is re-run.
tools = os.path.join(sumo_home, "tools")
if tools not in sys.path:
    sys.path.append(tools)

# Surface a misconfigured SUMO_HOME early instead of failing later on import.
if not os.path.isdir(tools):
    print(f"WARNING: SUMO tools directory not found at '{tools}'; check SUMO_HOME")
#import traci


# Macros / utils


In [2]:
# Project root; the working directory is switched to it below.
base_path = "/app"

# The SUMO code does not work when base_path contains a space, so refuse
# such a path up front.
if base_path.count(' ') > 0:
    raise ValueError("base_path should not contain spaces")

# Make the project root the current working directory.
os.chdir(base_path)

In [None]:
# Install missing packages into the kernel's environment.
# `requirements.txt` is a relative path, so it is looked up in the current
# working directory.
%pip install -r requirements.txt

In [4]:
import os
import numpy as np
import pandas as pd
import pprint
import matplotlib.pyplot as plt
from pathlib import Path
import multiprocessing as mp

import torch
from torch.quasirandom import SobolEngine

from botorch import fit_gpytorch_mll
from botorch.acquisition import qLogExpectedImprovement
from botorch.models import SingleTaskGP
from botorch.models.transforms import Standardize
from botorch.optim import optimize_acqf
from botorch.sampling.stochastic_samplers import StochasticSampler
from botorch.utils.transforms import unnormalize, normalize

from gpytorch.constraints import Interval
from gpytorch.kernels import MaternKernel, ScaleKernel
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.mlls import ExactMarginalLogLikelihood


from bayesian_optimization.helpers import (load_kwargs_config, 
                    compute_nrmse_counts_all_edges, 
                    parse_loop_data_xml_to_pandas, 
                    create_taz_xml,
                    simulate_od,
                    od_xml_to_df,
                    )

In [None]:
# Load the run configuration through the project helper
# (presumably "gridsearch" selects the scenario/config entry -- TODO confirm).
config = load_kwargs_config(base_path, "gridsearch")
# Create the simulation output directory, including any missing parents.
Path(config["simulation_run_path"]).mkdir(parents=True, exist_ok=True)
# Echo the resolved configuration for the record.
pprint.pprint(dict(config))

# Create GT (ground truth) scenario


In [None]:
# Ground-truth OD table, read through the project helper.
print(f"Reading: {config['file_gt_od']}")
gt_od_df = od_xml_to_df(config["file_gt_od"])

# Fixed route set; the first CSV column is used as the index.
print(f"Reading: {config['fixed_routes']}")
routes_df = pd.read_csv(config["fixed_routes"], index_col=0)

# Optional edge filter: only used when the config names a file that exists
# on disk; otherwise no edge selection is applied (None).
edge_selection = None
if "edge_selection" in config and os.path.exists(config["edge_selection"]):
    print(f"Reading: {config['edge_selection']}")
    # Header-less, single-column CSV -> plain list of edge ids.
    edge_selection = (
        pd.read_csv(config["edge_selection"], header=None)
        .set_axis(["edge_id"], axis=1)["edge_id"]
        .tolist()
    )


Simulate the GT scenario to obtain the GT traffic statistics


In [None]:
# Output locations for the ground-truth run.
simulation_gt_run_path =f'{config["simulation_run_path"]}/ground_truth'
Path(simulation_gt_run_path).mkdir(parents=True, exist_ok=True)

prefix_output_gt = f'{simulation_gt_run_path}/sim'
sim_edge_out_gt = f'{prefix_output_gt}_{config["EDGE_OUT_STR"]}'
new_od_xml = f'{simulation_gt_run_path}/od.xml'

# Ground-truth counts as floats; `curr_od` is an independent copy of them.
gt_od_vals = gt_od_df['count'].astype(float).to_numpy()
curr_od = gt_od_vals.copy()

# OD table in the column naming expected by create_taz_xml ('from' / 'to').
base_od = (
    gt_od_df
    .assign(count=curr_od)
    .rename(columns={'fromTaz': 'from', 'toTaz': 'to'})
)
create_taz_xml(new_od_xml, base_od, config["od_duration_sec"], base_path)
print(base_od)

# Run simulation
simulate_od(
    new_od_xml,
    prefix_output_gt,
    base_path,
    config["net_xml"],
    config["taz2edge_xml"],
    config["additional_xml"],
    routes_df,
    config["sim_end_time"],
    config["TRIPS2ODS_OUT_STR"],
)



Read and process the GT simulation outputs


In [None]:
# Parse the GT edge output; only the first of the three returned values is used.
df_edge_gt, _, _ = parse_loop_data_xml_to_pandas(
    base_path,
    sim_edge_out_gt,
    prefix_output_gt,
    config["SUMO_PATH"],
    edge_list=edge_selection,
)

# Every parsed edge is kept as a GT edge; they are ordered by
# 'interval_nVehContrib', largest first.
num_gt_edges = len(df_edge_gt)
print("Number of GT edges:",num_gt_edges)
gt_edge_data = (
    df_edge_gt
    .sort_values(by=['interval_nVehContrib'], ascending=False)
    .head(num_gt_edges)
)

print(sim_edge_out_gt)
print(gt_edge_data.head())

# Optimization

Bayesian optimization utils / helpers


In [None]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(device)
dtype = torch.double

# One decision variable per row of the ground-truth OD table.
dim_od = len(gt_od_df)
print(dim_od)

# Box constraints stacked as (2, dim_od): row 0 holds the lower bounds (0),
# row 1 the upper bounds (2000).
bounds = torch.stack([
    torch.zeros(dim_od, device=device, dtype=dtype),
    torch.full((dim_od,), 2000, device=device, dtype=dtype),
])

## Create and simulate a full-grid search.


In [None]:
# Full grid search: every combination of `n_full_search` evenly spaced levels
# per OD entry (n_full_search ** dim_od grid points before filtering).
n_full_search = 201

# One normalized [0, 1] axis per OD entry. The axes are created directly in
# the dtype of `bounds` (float64): building them in the default float32 and
# promoting afterwards leaves float32 rounding noise in the unnormalized
# values (e.g. 10.000000237 instead of 10.0).
candidates = [
    torch.linspace(0, 1, n_full_search, dtype=bounds.dtype)
    for _ in range(dim_od)
]

# Cartesian product of the axes, reshaped to one row per grid point and one
# column per OD entry.
search_space = torch.stack(torch.meshgrid(candidates, indexing="ij"), 0)
search_space = search_space.reshape(dim_od, -1).transpose(0, 1)
search_space = search_space.to(device)

# map the normalized into the original parameter space
train_X0 = unnormalize(search_space, bounds)

# delete rows if contains 0
# (this also removes the all-zero first grid point, so no separate slice is
# needed for it)
train_X0 = train_X0[~(train_X0 == 0).any(dim=1)]

print(f"train_X0 shape = {train_X0.shape}")
print(train_X0)


In [11]:
import multiprocessing as mp
import numpy as np
import torch
from pathlib import Path

# Function to write logs safely
def log_results(file_handle, log_data, lock):
    """Append one line to ``file_handle`` while holding ``lock``.

    Args:
        file_handle: Writable text file object.
        log_data: Line content as a string; a newline is appended here.
        lock: Lock object serialising access to the file.
    """
    lock.acquire()
    try:
        line = log_data + '\n'
        file_handle.write(line)
        # Push the line out right away rather than leaving it in the buffer.
        file_handle.flush()
    finally:
        lock.release()

def run_simulation(od_matrix, i, config, gt_od_df, base_path, routes_df, gt_edge_data, edge_selection, lock):
    """Simulate one candidate OD vector, compute its loss and log the result.

    Args:
        od_matrix: Sequence of OD counts, one value per row of ``gt_od_df``.
        i: Index of the candidate; used in output file names and the log line.
        config: Mapping providing the keys read below (``simulation_run_path``,
            ``network_name``, ``od_duration_sec``, ``net_xml``,
            ``taz2edge_xml``, ``additional_xml``, ``sim_end_time``,
            ``TRIPS2ODS_OUT_STR``, ``EDGE_OUT_STR``, ``SUMO_PATH``).
        gt_od_df: OD table used as a template; it is copied and the copy's
            ``count`` column is replaced by the candidate values.
        base_path: Project root, forwarded to the project helpers.
        routes_df: Route table forwarded to ``simulate_od``.
        gt_edge_data: Reference edge statistics the loss is computed against.
        edge_selection: Forwarded to ``parse_loop_data_xml_to_pandas`` as
            ``edge_list``.
        lock: Lock guarding writes to the shared log file.

    Returns:
        Tuple ``(curr_od, curr_loss)``: the candidate as a NumPy array
        (unrounded) and the value returned by
        ``compute_nrmse_counts_all_edges``.
    """
    print(f"########### OD: {i} ###########")
    print(od_matrix)

    simulation_run_path_grid = f'{config["simulation_run_path"]}/full_search'
    Path(simulation_run_path_grid).mkdir(parents=True, exist_ok=True)

    new_od_xml = f"{simulation_run_path_grid}/grid_od_{config['network_name']}_{i}.xml"
    prefix_output_grid = f'{simulation_run_path_grid}/grid_{i}'

    curr_od = np.array(od_matrix)

    print(f'Total expected GT demand: {np.sum(curr_od)}')

    # Create OD XML file (counts are rounded to one decimal before writing)
    base_od = gt_od_df.copy()
    base_od['count'] = curr_od
    base_od['count'] = [round(elem, 1) for elem in base_od['count']]
    base_od = base_od.rename(columns={'fromTaz': 'from', 'toTaz': 'to'})
    create_taz_xml(new_od_xml, base_od, config["od_duration_sec"], base_path)

    # Simulate OD
    simulate_od(new_od_xml,
                prefix_output_grid,
                base_path,
                config["net_xml"],
                config["taz2edge_xml"],
                config["additional_xml"],
                routes_df,
                config["sim_end_time"],
                config["TRIPS2ODS_OUT_STR"])

    # Compute loss.
    # Joining with pathlib gives the same path as plain concatenation when
    # the run path is relative, and does not produce a broken
    # "<base_path>//abs/..." path when `simulation_run_path` is absolute.
    sim_edge_out = str(Path(base_path) / f'{prefix_output_grid}_{config["EDGE_OUT_STR"]}')
    curr_loop_stats, _, _ = parse_loop_data_xml_to_pandas(base_path, sim_edge_out, prefix_output_grid, config["SUMO_PATH"], edge_list=edge_selection)
    curr_loss = compute_nrmse_counts_all_edges(gt_edge_data, curr_loop_stats)

    # Open log file in append mode inside each process
    log_file_path = f'{config["simulation_run_path"]}/simulation_log.txt'
    with open(log_file_path, 'a') as log_file:
        # Log result safely to file in CSV format: index, loss, OD values
        log_data = f"{i}, {curr_loss}, " + ', '.join(map(str, curr_od))
        log_results(log_file, log_data, lock)

    # Return the loss and OD matrix
    return curr_od, curr_loss

def parallel_sumo_runs(train_X0, config, gt_od_df, base_path, routes_df, gt_edge_data, edge_selection):
    """Run ``run_simulation`` for every row of ``train_X0`` in a process pool.

    Args:
        train_X0: 2-D tensor of candidates; converted with ``.tolist()`` so
            each row becomes one OD vector.
        config, gt_od_df, base_path, routes_df, gt_edge_data, edge_selection:
            Forwarded unchanged to ``run_simulation``.

    Returns:
        Tuple ``(ods_epsilon, loss_all)``: the OD arrays and losses returned
        by the successful runs, in submission order. Failed runs are skipped
        and reported through a single warning.
    """
    import warnings  # local import keeps this cell self-contained

    ods_epsilon = []
    loss_all = []
    train_X0_list = train_X0.tolist()

    # The manager provides a lock that can be shared with pool workers; the
    # context manager shuts the manager process down again when we are done.
    with mp.Manager() as manager:
        lock = manager.Lock()

        # Use all available CPUs
        with mp.Pool(processes=mp.cpu_count()) as pool:
            results = [pool.apply_async(run_simulation, args=(
                od,
                i,
                config,
                gt_od_df,
                base_path,
                routes_df,
                gt_edge_data,
                edge_selection,
                lock)) for i, od in enumerate(train_X0_list)]

            # No more submissions; wait for every task to finish.
            pool.close()
            pool.join()

            # Collect the outcomes. `get()` re-raises a worker's exception,
            # which would otherwise be lost without a trace.
            failures = []
            for i, res in enumerate(results):
                try:
                    curr_od, curr_loss = res.get()
                except Exception as exc:
                    failures.append((i, exc))
                    continue
                ods_epsilon.append(curr_od)
                loss_all.append(curr_loss)

    if failures:
        first_idx, first_exc = failures[0]
        warnings.warn(
            f"{len(failures)} of {len(train_X0_list)} simulations failed; "
            f"first failure at OD {first_idx}: {first_exc!r}"
        )

    return ods_epsilon, loss_all


In [None]:
# Run the whole grid in parallel. Each run appends one line to
# simulation_log.txt; the returned lists are bound to names so the cell does
# not echo the return value as its output.
ods_epsilon, loss_all = parallel_sumo_runs(train_X0, config, gt_od_df, base_path, routes_df, gt_edge_data, edge_selection)

In [12]:
# Read the header-less per-simulation log back into a DataFrame.
log_file_path = f'{config["simulation_run_path"]}/simulation_log.txt'
data = pd.read_csv(log_file_path, header=None)

# Column layout: candidate index, loss, then one column per OD entry.
len_col = data.shape[1]
data.columns = ["od_index", "loss", *(f"od_{i}" for i in range(len_col-2))]

# Round the OD columns to one decimal.
data.iloc[:,2:] = data.iloc[:,2:].map(round, ndigits=1)

In [None]:
data

In [None]:
# draw a heatmap
import seaborn as sns
import matplotlib.pyplot as plt

# Layout of the figure:
#   data.od_0 : rows of the pivot table (y-axis)
#   data.od_1 : columns of the pivot table (x-axis)
#   data.loss : color
# Keep only the window 250 <= od_0 <= 750 and 650 <= od_1 <= 1150.
data_temp = data[
    data['od_0'].between(250, 750) & data['od_1'].between(650, 1150)
].copy()
heatmap_data = data_temp.pivot_table(index='od_0', columns='od_1', values='loss')

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(heatmap_data, cmap='coolwarm', annot=False, ax=ax)
ax.set_title('NRMSE heatmap')

# save the heatmap
fig.savefig(f'{config["simulation_run_path"]}/heatmap.png')
