In [None]:
import torch
import os
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm

from torch.quasirandom import SobolEngine

from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.kernels import MaternKernel, ScaleKernel
from gpytorch.constraints import Interval

from botorch import fit_gpytorch_mll
from botorch.utils.transforms import unnormalize, normalize
from botorch.models import SingleTaskGP
from botorch.models.transforms import Standardize
from botorch.optim import optimize_acqf
from botorch.acquisition import qLogExpectedImprovement
from botorch.sampling.stochastic_samplers import StochasticSampler

from helpers import (load_experiment_metadata, 
                    compute_nrmse_counts_all_edges, 
                    parse_loop_data_xml_to_pandas, 
                    create_taz_xml,
                    simulate_od,
                    od_xml_to_df,
                    xml2df_str)


In [None]:
# Show the directory the kernel is currently running in. Uses the standard
# library instead of the shell: the capitalised `Pwd` command only resolves on
# case-insensitive filesystems and is unavailable on other platforms.
os.getcwd()

In [None]:
# Root of the project checkout; every relative path used below (config/,
# network/, output/) is resolved against it because the notebook changes its
# working directory to this location.
#
# Set the OD_BAYES_OPT_BASE_PATH environment variable to run on another machine
# without editing the notebook; when it is unset, the original default is used.
# Paths previously used on other machines:
# base_path = "/home/bench/Gitsrcs/origin_destination_bayes_opt"
# base_path = "/Users/osorio/HEC/Research/Group/FacultyCollaborations/SeongjinChoi_UMN/Code_BO/origin_destination_bayes_opt-main"
base_path = os.environ.get(
    "OD_BAYES_OPT_BASE_PATH",
    "/Users/chois/Gitsrcs/origin_destination_bayes_opt",
)
os.chdir(base_path)

In [None]:
# Experiment metadata is read from the 'config' folder under base_path.
config_path = Path(base_path) / 'config'
print(f"config_path: {config_path}")

config, sim_setup = load_experiment_metadata(config_path)

# Network / model pair selected by the simulation setup.
network_name, model_name = sim_setup['network_name'], sim_setup['model_name']

# Input files of the selected network, all located in network/<network_name>.
network_path = Path("network") / network_name
taz2edge_xml = Path(base_path) / network_path / 'taz.xml'
net_xml = Path(base_path) / network_path / 'net.xml'
fixed_routes = Path(base_path) / network_path / 'routes.csv'
file_gt_od = Path(base_path) / network_path / 'od.xml'
additional_xml = Path(base_path) / network_path / 'additional.xml'

# Output folder of this run (relative to the current working directory).
out_path = f"output/{network_name}_{model_name}"
os.makedirs(out_path, exist_ok=True)

gt_version_str = network_name           ## TODO : need to check if this is correct

# Suffix of the simulation output edge file.
EDGE_OUT_STR = f'edge_data_{network_name}.xml'
TRIPS2ODS_OUT_STR = 'trips.xml'
SUMO_PATH = config["SUMO"]

# Simulation timing settings.
sim_start_time, sim_end_time, sim_stat_freq_sec, od_duration_sec = (
    sim_setup[key]
    for key in ('sim_start_time', 'sim_end_time', 'sim_stat_freq_sec', 'od_duration_sec')
)

n_init_search = sim_setup['n_init_search']

# Bayesian-optimization settings.
NITER, BATCH_SIZE, NUM_RESTARTS, RAW_SAMPLES = (
    sim_setup[key]
    for key in ("BO_niter", "BO_batch_size", "BO_num_restarts", "BO_raw_samples")
)



In [None]:

# # taz2edge_xml = 'taz_new.xml'
# # net_xml = 'SFO.net.xml'
# # fixed_routes_xml = f'{base_path}/5hr_route_choice_set.csv'
# # od_duration_seconds = 5*60 

# # # duration of sample time for simulation output statistics
# # simulation_stat_freq_sec = od_duration_seconds
# # sim_end_time = od_duration_seconds
# # additional_xml = f'additional.add_statfreq{od_duration_seconds}.xml'

# # # suffix of simulation output edge file
# # EDGE_OUT_STR = 'edge_data_SFO.xml'
# # TRIPS2ODS_OUT_STR = 'trips.xml'
# # SUMO_PATH = '/usr/local/opt/sumo/share/sumo'

# od_duration_seconds = 30*60 

# # duration of sample time for simulation output statistics
# simulation_stat_freq_sec = od_duration_seconds
# sim_end_time = od_duration_seconds

# # TODO: It might be cleaner to replace these settings with a config file (I attached an example to my email), with one config file defined per network.
# network_name = "quickstart"
# model_name = "bo_vanilla"

# network_path = f"network/{network_name}"
# taz2edge_xml = f"{base_path}/{network_path}/taz.xml"
# net_xml = f"{base_path}/{network_path}/net.xml"
# fixed_routes = f"{base_path}/{network_path}/routes.csv"
# # od_xml = f"{network_path}/od.xml"       ## TODO : need to check if this is correct
# file_gt_od = f"{base_path}/{network_path}/od.xml"      ## TODO : need to check if this is correct
# # file_gt_edges                         ## TODO : need to check if this is necessary (not being used below)
# additional_xml = f"{base_path}/{network_path}/additional.xml"
# out_path = f"output/{network_name}_{model_name}"
# out_path = f"output/{network_name}_{model_name}"       ## TODO : need to check if this is correct
# # prefix_output = f"{out_path}/out"     ## TODO : need to check if this is correct
# gt_version_str = network_name           ## TODO : need to check if this is correct

# EDGE_OUT_STR = f'edge_data_{network_name}.xml'
# # suffix of simulation output edge file
# TRIPS2ODS_OUT_STR = 'trips.xml'
# # TODO I changed this path for it to work for me.
# SUMO_PATH = '/opt/homebrew/opt/sumo/share/sumo'
# #SUMO_PATH = "/usr/share/sumo"

# Path(out_path).mkdir(parents=True, exist_ok=True)




In [None]:
# Settings for ground-truth version v4.
gt_version_str = 'v4'
mean_od_val, num_ods = 100, 10

# The commented assignment below switches from 10 ODs to one per route row.
print('if you want to optimize them all (~86k) set num_ods as defined in commented line below')
#num_ods = routes_df.shape[0]

In [None]:
# od_xml = f'gt_od_{gt_version_str}.xml'
# file_gt = f'{base_path}/gt_od_{gt_version_str}.xml'
# file_gt_edges = f'{base_path}/gt_edges_{gt_version_str}.csv'
# prefix_output_gt = f'gt_{gt_version_str}'

In [None]:
# Get GT OD
print("Reading:",file_gt_od)
tree = ET.parse(file_gt_od)
root = tree.getroot()
gt_od_df =  xml2df_str(root, 'tazRelation')

gt_od_df.head()

In [None]:
# Load the fixed route set from CSV; the first CSV column is used as the index.
print("Reading:",fixed_routes)
routes_df = pd.read_csv(fixed_routes, index_col=0)

In [None]:
# Re-read the ground-truth OD file with od_xml_to_df; this replaces the
# gt_od_df that was built with xml2df_str.
gt_od_df = od_xml_to_df(file_gt_od)

In [None]:
# Inspect the column names of the ground-truth OD table (later cells read its
# 'count' column and rename 'fromTaz' / 'toTaz').
gt_od_df.columns

## Vanilla BO


### Declare parameter space


In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)
dtype = torch.double

### Declare search space
# Dimensionality of the input space: one dimension per row of the GT OD table.
dim_od = gt_od_df.shape[0]

# Disabled alternative: bounds derived from the GT counts with a margin of 2.
#bounds = torch.tensor([
#    [ gt_od_df['count'].astype(float).min() - 2 for _ in range(dim_od)],
#    [ gt_od_df['count'].astype(float).max() + 2 for _ in range(dim_od)]
#], device=device, dtype=dtype)

# 2 x dim_od tensor: row 0 is the lower bound (0) and row 1 the upper bound
# (2000) of every dimension.
bounds = torch.stack((
    torch.zeros(dim_od, device=device, dtype=dtype),
    torch.full((dim_od,), 2000, device=device, dtype=dtype),
))


bounds



### Run the ground-truth (GT) simulation


In [None]:
# Output locations: all runs go under out_path, the ground-truth run into its
# own 'ground_truth' subfolder.
simulation_run_path = f'{out_path}'
simulation_gt_run_path = f'{out_path}/ground_truth'
Path(simulation_gt_run_path).mkdir(parents=True, exist_ok=True)

prefix_output_gt = f'{simulation_gt_run_path}/sim'
sim_edge_out_gt = f'{prefix_output_gt}_{EDGE_OUT_STR}'
new_od_xml = f'{simulation_gt_run_path}/od.xml'

# Build the OD table for the ground-truth run: GT counts cast to float, with
# the taz columns renamed to 'from' / 'to' before the OD xml is written.
gt_od_vals = gt_od_df['count'].astype(float).to_numpy()
curr_od = gt_od_vals.copy()
base_od = (
    gt_od_df
    .assign(count=curr_od)
    .rename(columns={'fromTaz': 'from', 'toTaz': 'to'})
)
create_taz_xml(new_od_xml, base_od, od_duration_sec, base_path)

print(base_od)

# Run the simulation for the ground-truth OD.
simulate_od(
    new_od_xml,
    prefix_output_gt,
    base_path,
    net_xml,
    taz2edge_xml,
    additional_xml,
    routes_df,
    sim_end_time,
    TRIPS2ODS_OUT_STR,
)



### Read the output of the GT simulation


In [None]:
# Parse the edge output of the ground-truth run; only the first returned value
# is used here.
df_edge_gt, _, _ = parse_loop_data_xml_to_pandas(
    base_path, sim_edge_out_gt, prefix_output_gt, SUMO_PATH
)

# Every edge is kept as a GT edge: the slice length equals the row count, so
# the chain below only orders the rows by descending 'interval_nVehContrib'.
num_gt_edges = len(df_edge_gt.index)
print("Number of GT edges:", num_gt_edges)
gt_edge_data = (
    df_edge_gt
    .sort_values(by=['interval_nVehContrib'], ascending=False)
    .iloc[:num_gt_edges]
)

# gt_edge_data.shape


In [None]:
# Display the ground-truth edge table (rows sorted by 'interval_nVehContrib').
gt_edge_data

In [None]:
# Sanity check: bounds holds one lower row and one upper row, with dim_od columns.
bounds.shape

In [None]:
# full grid search
n_full_search = 21
candidates = []

# print(dim_od)
for i in range(dim_od):
    candidates.append(torch.linspace(0,1,n_full_search))

search_space = torch.meshgrid(candidates)
search_space = torch.stack(search_space , 0)
search_space.shape
search_space = search_space.view(dim_od, -1)
search_space = search_space.transpose(0,1)
print(f"search_space shape = {search_space.shape}")

# map the normalized into the original parameter space
train_X0 = unnormalize(search_space, bounds)
train_X0 = train_X0[1:,:]
train_X0

In [None]:
# Evaluate every candidate OD vector of train_X0: write its OD xml, run the
# simulation, compare the simulated edge counts with the ground truth, and
# collect the OD values together with the resulting loss.
#num_epsilon_iter = 2
ods_epsilon = []    # OD vector (as passed in, before rounding) of each evaluation
loss_all = []       # loss of each evaluation
batch_data_i = []   # one single-row DataFrame (x_1..x_dim_od, loss) per evaluation

# Base OD table whose 'count' entries are overwritten for every candidate.
# The column rename expected by create_taz_xml is loop-invariant, so it is
# applied once here instead of on every iteration.
base_od = gt_od_df.rename(columns={'fromTaz':'from', 'toTaz':'to'})
gt_od_vals = gt_od_df['count'].astype(float).to_numpy()

# Loop-invariant set-up: output folder and feature column names.
Path(f'{simulation_run_path}/full_search').mkdir(parents=True, exist_ok=True)
x_columns = [f"x_{k+1}" for k in range(dim_od)]

# To debug with hand-picked ODs, iterate over explicit vectors instead, e.g.:
#for i , x in enumerate(
#      [[ 94.66438596,  91.97375804, 101.82277249, 112.44778006,
#            105.33019264,  92.62166575,  99.8673423 ,  93.71928772,
#            116.16658554,  94.79717515],
#      [ 97.4, 114.9, 104.1, 100. , 109.1, 106.7,  87.8, 101.1, 113.9,109.4]]):
for i in tqdm(range(train_X0.shape[0])):
    x = train_X0[i]
    print(f"########### OD: {i} ###########")
    print(x)

    new_od_xml = f'{simulation_run_path}/full_search/gt_od_{gt_version_str}_{i}.xml'
    prefix_output_init = f'{simulation_run_path}/full_search/fullsearch_{i}'

    # Generate OD
    # Move the candidate to host memory before converting it: NumPy cannot read
    # a CUDA tensor directly. The copy keeps the stored OD independent of
    # train_X0's storage.
    #curr_od = gt_od_vals.copy()
    if torch.is_tensor(x):
        curr_od = x.detach().cpu().numpy().copy()
    else:
        curr_od = np.array(x)

    print(f'total expected GT demand: {np.sum(curr_od)}')

    ###
    # create OD xml file
    ###
    # Counts are rounded to 1 decimal point before they are written.
    base_od['count'] = np.round(curr_od, 1)
    create_taz_xml(new_od_xml, base_od, od_duration_sec, base_path)
    ods_epsilon.append(curr_od)

    # Simulate the candidate OD.
    simulate_od(new_od_xml,
                prefix_output_init,
                base_path,
                net_xml,
                taz2edge_xml,
                additional_xml,
                routes_df,
                sim_end_time,
                TRIPS2ODS_OUT_STR)

    ## Compute loss
    #prefix_output = f'full_search/sobol_{i}'
    sim_edge_out = f'{base_path}/{prefix_output_init}_{EDGE_OUT_STR}'
    print(sim_edge_out)
    curr_loop_stats, _, _ = parse_loop_data_xml_to_pandas(base_path, sim_edge_out,prefix_output_init,SUMO_PATH)
    curr_loss = compute_nrmse_counts_all_edges(gt_edge_data, curr_loop_stats)

    loss_all.append(curr_loss)
    print(f"############## loss: {curr_loss} ##############")

    # Parse training data: one row holding the OD values and their loss.
    df_curr = pd.DataFrame(curr_od.reshape(1,dim_od), columns=x_columns)
    df_curr['loss'] = curr_loss
    batch_data_i.append(df_curr)



In [None]:
# Stack the per-evaluation rows into one table. Each piece is a single-row
# frame with index label 0, so the index is rebuilt (0..n-1) to avoid duplicate
# labels in the combined frame.
df_initial_bo = pd.concat(batch_data_i, ignore_index=True)
df_initial_bo.head()

In [None]:
# Display the run's output folder; the data set below is saved under it.
simulation_run_path

In [None]:
# Save initial dataset
# Save the initial data set (OD values + loss) without the row index.
# `index=False` is the documented way to omit the index; `index=None` only
# worked because None is falsy.
df_initial_bo.to_csv(f"{simulation_run_path}/full_search/data_set_ods_0_2000.csv", index=False)
