In [1]:
import os
import glob
import optuna
import warnings
import h5py as h5
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from copy import deepcopy

from gensit.config import Config
from gensit.inputs import Inputs
from gensit.outputs import Outputs
from gensit.utils.misc_utils import *
from gensit.utils.math_utils import *
from gensit.utils.probability_utils import *
from gensit.contingency_table import instantiate_ct
from gensit.contingency_table.MarkovBasis import instantiate_markov_basis
from gensit.static.plot_variables import LATEX_RC_PARAMETERS, COLOR_NAMES

In [2]:
%matplotlib inline

# AUTO RELOAD EXTERNAL MODULES
%load_ext autoreload
%autoreload 2

In [3]:
# LaTeX font configuration: apply the project's LATEX_RC_PARAMETERS
# (imported from gensit.static.plot_variables) to matplotlib's global rcParams.
mpl.rcParams.update(LATEX_RC_PARAMETERS)

In [4]:
# Create new logging object for this notebook.
# This is also the `logger` that write_mixed_numpy_to_dat reports failures to.
# setup_logger is presumably provided by one of the gensit star imports - TODO confirm.
logger = setup_logger(
    __name__,
    console_level = 'INFO',
    file_level = 'EMPTY'
)

In [5]:
# Directory that the exported figure data files (.dat / .txt) are written to.
# NOTE(review): no visible cell creates this directory; the export cells will
# fail if it does not already exist.
output_path = '../../data/outputs/cambridge/exp1/paper_figures/figure3/'

# Import samples

## SIM_NN

In [6]:
# Specify experiment id
sim_nn_experiment_id = "SIM_NN_SweepedNoise_05_12_2023_21_23_00"
# Specify experiment group id
experiment_group_id = 'exp1/'
dataset = 'cambridge'
# Relative path to the experiment's output directory.
# experiment_group_id already ends in '/', so the f-string contains a doubled
# separator; os.path.relpath below normalises the path.
sim_nn_experiment_dir = f'../../data/outputs/{dataset}/{experiment_group_id}/{sim_nn_experiment_id}/'
# Same directory expressed relative to the current working directory
sim_nn_relative_experiment_dir = os.path.relpath(sim_nn_experiment_dir,os.getcwd())

In [None]:
# Output processing settings for the SIM_NN experiment (passed to Outputs below)
sim_nn_settings = {
    "logging_mode": "INFO",
    # Keep only the sweep with sigma == 0.14142 and title '_row_constrained'
    "coordinate_slice": [
        "da.sigma==0.14142",
        "da.title=='_row_constrained'"
    ],
    "slice":True,
    "metadata_keys":[],
    # Burn-in / thinning / trimming applied along the 'iter' dimension.
    # NOTE(review): with these values presumably every iteration is kept - confirm against Outputs.
    "burnin_thinning_trimming": [{'iter': {"burnin":0, "thinning":1, "trimming":10000000}}],
    # Quantity read back later with get_sample("intensity")
    "sample":["intensity"],
    "group_by":[],
    "evaluation_kwargs":['intensity'],
    "filename_ending":"test",
    "force_reload":False,
    "n_workers": 1
}

In [8]:
# Initialise outputs from the experiment's config.json and the settings above
simnn_outputs = Outputs(
    config = os.path.join(sim_nn_relative_experiment_dir,"config.json"),
    settings = sim_nn_settings,
    inputs = None,
    slice = True,
    level = 'NOTE'
)
# Silence outputs
simnn_outputs.logger.setLevels(console_level='NOTE')
# Load all data
simnn_outputs.load()

# Get data from first sweep of the SIM_NN experiment.
# NOTE: this rebinds simnn_outputs from the full Outputs object to the result of get(0).
simnn_outputs = simnn_outputs.get(0)

49:16.882 config INFO ----------------------------------------------------------------------------------
49:16.892 config INFO Parameter space size: 
 --- seed (100)
 --- title: ['title', 'name'] (2)
 --- sigma: ['sigma', 'to_learn'] (3)
49:16.903 config INFO Total = 600.
49:16.913 config INFO ----------------------------------------------------------------------------------
49:16.952 outputs INFO //////////////////////////////////////////////////////////////////////////////////
49:16.962 outputs INFO Slicing coordinates:
49:16.972 outputs INFO sigma==0.14142
49:16.982 outputs INFO title=='_row_constrained'
49:16.993 outputs INFO iter: burnin = 0, thinning = 1, trimming = 10000000
49:17.003 outputs INFO //////////////////////////////////////////////////////////////////////////////////
49:17.024 outputs INFO Collecting samples alpha, beta, log_destination_attraction.
Collecting h5 data sequentially:   0%|          | 0/600 [00:00<?, ?it/s]49:17.118 inputs NOTE Loading Harris Wilson data 

In [9]:
# Load the input data described by the SIM_NN experiment's config, reusing its logger
inputs = Inputs(
    config = simnn_outputs.config,
    synthetic_data = False,
    logger = simnn_outputs.logger
)
# NOTE(review): presumably converts the loaded inputs to xarray objects (inferred from the method name) - confirm
inputs.cast_to_xarray()
# Reference table that every method's mean estimate is compared against below
ground_truth_table = inputs.data.ground_truth_table

52:49.776 inputs NOTE Loading Harris Wilson data ...
52:49.797 inputs NOTE Margins not provided
52:49.812 inputs NOTE Cells subset values file not provided


In [10]:
# Intensity samples from the SIM_NN run
simnn_intensity = simnn_outputs.get_sample("intensity")
# Mean over iterations and seeds, accumulated in float64
simnn_intensity_mean = simnn_intensity.mean(dim=['iter','seed'],dtype='float64')
# Collapse (iter, seed) into a single 'id' dimension so coverage is evaluated over all draws together
simnn_intensity_flat = simnn_intensity.stack(id=['iter','seed'])
# coverage_probability is given a deep copy of the stacked samples
# NOTE(review): presumably because it modifies its input - confirm
simnn_intensity_cp = coverage_probability(deepcopy(simnn_intensity_flat),ground_truth_table,dim='id')
# Residual = ground truth minus sample-mean intensity, with the 'sweep' dimension squeezed out
sim_nn_residual = (ground_truth_table-simnn_intensity_mean).squeeze('sweep')
# Residuals summed over origins (indexed by destination in the export cell below)
sim_nn_residual_demand = sim_nn_residual.sum('origin',dtype='float64')

52:49.877 inputs NOTE Loading Harris Wilson data ...
52:49.894 inputs NOTE Margins not provided
52:49.907 inputs NOTE Cells subset values file not provided
52:49.983 outputs PROGRESS Loaded cost_matrix sample
52:50.006 outputs PROGRESS Loaded origin_demand sample
52:50.027 outputs PROGRESS Loaded grand_total sample
52:50.048 __init__ PROGRESS Building 
            69x13 ProductionConstrained Spatial Interaction Model
            Origin demand sum: 0.9999999403953552
            Destination demand sum: None
            Origin attraction sum: None
            Destination attraction sum: None
            Cost matrix sum: 358.17156982421875
            Alpha: 1.0
            Beta: 1.0
            Beta scaling: None
        
52:50.066 outputs PROGRESS Loaded alpha sample
52:50.083 outputs PROGRESS Loaded log_destination_attraction sample
52:50.543 outputs PROGRESS Loaded intensity sample


In [11]:
# Show the sweep coordinate value(s) that remain after slicing
print(simnn_intensity.sweep.values)

[(0.14142000675201416, "['alpha', 'beta']", '_row_constrained', 'ProductionConstrained')]


In [12]:
# (min, max) of the SIM_NN flow residuals
sim_nn_residual.values.min(),sim_nn_residual.values.max()

(-75.01730598461151, 144.96222004364012)

## SIM_MCMC

In [13]:
# Specify experiment id
sim_mcmc_experiment_id = "SIM_MCMC_SweepedNoise_16_05_2023_20_09_04"
# Specify experiment group id
experiment_group_id = 'exp1/'
dataset = 'cambridge'
# Relative path to the experiment's output directory.
# experiment_group_id already ends in '/', so the f-string contains a doubled
# separator; os.path.relpath below normalises the path.
sim_mcmc_experiment_dir = f'../../data/outputs/{dataset}/{experiment_group_id}/{sim_mcmc_experiment_id}/'
# Same directory expressed relative to the current working directory
sim_mcmc_relative_experiment_dir = os.path.relpath(sim_mcmc_experiment_dir,os.getcwd())

In [14]:
# Output processing settings for the SIM_MCMC experiment (passed to Outputs below)
sim_mcmc_settings = {
    "logging_mode": "INFO",
    # Keep only the sweep with sigma == 0.14142 and title '_row_constrained'
    "coordinate_slice": [
        "da.sigma==0.14142",
        "da.title=='_row_constrained'"
    ],
    "slice":True,
    "metadata_keys":[],
    # Burn-in / thinning / trimming applied along the 'iter' dimension.
    # NOTE(review): with these values presumably every iteration is kept - confirm against Outputs.
    "burnin_thinning_trimming": [{'iter': {"burnin":0, "thinning":1, "trimming":10000000}}],
    # Quantity read back later with get_sample("intensity")
    "sample":["intensity"],
    "group_by":[],
    "evaluation_kwargs":['intensity'],
    "filename_ending":"test",
    "force_reload":False,
    "n_workers": 1
}

In [15]:
# Initialise outputs from the experiment's config.json and the settings above
sim_mcmc_outputs = Outputs(
    config = os.path.join(sim_mcmc_relative_experiment_dir,"config.json"),
    settings = sim_mcmc_settings,
    inputs = None,
    slice = True,
    level = 'NOTE'
)
# Silence outputs
sim_mcmc_outputs.logger.setLevels(console_level='NOTE')
# Load all data
sim_mcmc_outputs.load()

# Get data from first sweep of the SIM_MCMC experiment.
# NOTE: this rebinds sim_mcmc_outputs from the full Outputs object to the result of get(0).
sim_mcmc_outputs = sim_mcmc_outputs.get(0)

52:53.054 config INFO ----------------------------------------------------------------------------------
52:53.065 config INFO Parameter space size: 
 --- title: ['title', 'name'] (2)
 --- sigma: ['sigma', 'to_learn', 'covariance'] (2)
52:53.076 config INFO Total = 4.
52:53.086 config INFO ----------------------------------------------------------------------------------
52:53.126 outputs INFO //////////////////////////////////////////////////////////////////////////////////
52:53.136 outputs INFO Slicing coordinates:
52:53.146 outputs INFO sigma==0.14142
52:53.156 outputs INFO title=='_row_constrained'
52:53.166 outputs INFO iter: burnin = 0, thinning = 1, trimming = 10000000
52:53.176 outputs INFO //////////////////////////////////////////////////////////////////////////////////
52:53.186 outputs INFO Reading samples alpha, beta, log_destination_attraction, sign.
52:53.224 outputs PROGRESS Slicing alpha
52:53.234 outputs PROGRESS Before coordinate slicing alpha[0]: {'iter': 100000}
5

In [16]:
# Intensity samples from the SIM_MCMC run
sim_mcmc_intensity = sim_mcmc_outputs.get_sample("intensity")
# Mean over iterations, accumulated in float64
sim_mcmc_intensity_mean = sim_mcmc_intensity.mean(dim=['iter'],dtype='float64')
# Residual = ground truth minus sample-mean intensity, with the 'sweep' dimension squeezed out
sim_mcmc_residual = (ground_truth_table-sim_mcmc_intensity_mean).squeeze('sweep')
# Residuals summed over origins (indexed by destination in the export cell below)
sim_mcmc_residual_demand = sim_mcmc_residual.sum('origin',dtype='float64')
# coverage_probability is given a deep copy of the samples
# NOTE(review): presumably because it modifies its input - confirm
sim_mcmc_intensity_cp = coverage_probability(deepcopy(sim_mcmc_intensity),ground_truth_table,dim='iter')

52:53.605 inputs NOTE Loading Harris Wilson data ...
52:53.620 inputs NOTE Margins not provided
52:53.631 inputs NOTE Cells subset values file not provided
52:53.665 outputs PROGRESS Loaded cost_matrix sample
52:53.676 outputs PROGRESS Loaded origin_demand sample
52:53.686 outputs PROGRESS Loaded grand_total sample
52:53.696 __init__ PROGRESS Building 
            69x13 ProductionConstrained Spatial Interaction Model
            Origin demand sum: 0.9999999403953552
            Destination demand sum: None
            Origin attraction sum: None
            Destination attraction sum: None
            Cost matrix sum: 1.0
            Alpha: 0.5
            Beta: 0.5
            Beta scaling: None
        
52:53.707 outputs PROGRESS Loaded alpha sample
52:53.717 outputs PROGRESS Loaded log_destination_attraction sample
52:54.156 outputs PROGRESS Loaded intensity sample


In [17]:
# Show the sweep coordinate value attached to the SIM_MCMC residuals
sim_mcmc_residual.sweep.values

array((0.14142000675201416, "['alpha', 'beta']", '[[1.0, 0.0], [0.0, 1.0]]', '_row_constrained', 'ProductionConstrained'),
      dtype=object)

## SIT_MCMC

In [18]:
# Specify experiment id
sit_mcmc_experiment_id = "JointTableSIM_MCMC_SweepedNoise_16_05_2023_20_09_04"
# Specify experiment group id
experiment_group_id = 'exp1/'
dataset = 'cambridge'
# Relative path to the experiment's output directory.
# experiment_group_id already ends in '/', so the f-string contains a doubled
# separator; os.path.relpath below normalises the path.
sit_mcmc_experiment_dir = f'../../data/outputs/{dataset}/{experiment_group_id}/{sit_mcmc_experiment_id}/'
# Same directory expressed relative to the current working directory
sit_mcmc_relative_experiment_dir = os.path.relpath(sit_mcmc_experiment_dir,os.getcwd())

# Output processing settings for the JointTableSIM_MCMC experiment (passed to Outputs below)
sit_mcmc_settings = {
    "logging_mode": "INFO",
    # Keep only the sweep with sigma == 0.14142 and title '_row_constrained'
    "coordinate_slice": [
        "da.sigma==0.14142",
        "da.title=='_row_constrained'"
    ],
    "slice":True,
    "metadata_keys":[],
    # Burn-in / thinning / trimming applied along the 'iter' dimension.
    # NOTE(review): with these values presumably every iteration is kept - confirm against Outputs.
    "burnin_thinning_trimming": [{'iter': {"burnin":0, "thinning":1, "trimming":10000000}}],
    # Quantity read back later with get_sample("table")
    "sample":["table"],
    "group_by":[],
    "evaluation_kwargs":['table'],
    "filename_ending":"test",
    "force_reload":False,
    "n_workers": 1
}

In [19]:
# Initialise outputs from the experiment's config.json and the settings above
sit_mcmc_outputs = Outputs(
    config = os.path.join(sit_mcmc_relative_experiment_dir,"config.json"),
    settings = sit_mcmc_settings,
    inputs = None,
    slice = True,
    level = 'NOTE'
)
# Silence outputs
sit_mcmc_outputs.logger.setLevels(console_level='NOTE')
# Load all data
sit_mcmc_outputs.load()
# Keep the first sweep only; this rebinds sit_mcmc_outputs to the result of get(0)
sit_mcmc_outputs = sit_mcmc_outputs.get(0)

52:57.129 config INFO ----------------------------------------------------------------------------------
52:57.138 config INFO Parameter space size: 
 --- title: ['title', 'axes', 'cells', 'name', 'proposal'] (7)
 --- sigma: ['sigma', 'to_learn', 'covariance'] (2)
52:57.148 config INFO Total = 14.
52:57.158 config INFO ----------------------------------------------------------------------------------
52:57.198 outputs INFO //////////////////////////////////////////////////////////////////////////////////
52:57.208 outputs INFO Slicing coordinates:
52:57.218 outputs INFO sigma==0.14142
52:57.227 outputs INFO title=='_row_constrained'
52:57.237 outputs INFO iter: burnin = 0, thinning = 1, trimming = 10000000
52:57.247 outputs INFO //////////////////////////////////////////////////////////////////////////////////
52:57.258 outputs INFO Reading samples table.
52:57.498 outputs PROGRESS Slicing table
52:57.508 outputs PROGRESS Before coordinate slicing table[0]: {'origin': 69, 'destination'

In [20]:
# Table samples from the first sweep of the JointTableSIM_MCMC experiment
# (the sweep itself was selected with get(0) when sit_mcmc_outputs was loaded)
sit_mcmc_table = sit_mcmc_outputs.get_sample("table")
# Mean over iterations, accumulated in float64
sit_mcmc_table_mean = sit_mcmc_table.mean(dim=['iter'],dtype='float64')
# Residual = ground truth minus sample-mean table, with the 'sweep' dimension squeezed out
sit_mcmc_residual = (ground_truth_table-sit_mcmc_table_mean).squeeze('sweep')
# Residuals summed over origins (indexed by destination in the export cell below)
sit_mcmc_residual_demand = sit_mcmc_residual.sum('origin',dtype='float64')
# coverage_probability is given a deep copy of the samples
# NOTE(review): presumably because it modifies its input - confirm
sit_mcmc_table_cp = coverage_probability(deepcopy(sit_mcmc_table),ground_truth_table,dim='iter')

52:58.870 inputs NOTE Loading Harris Wilson data ...
52:58.882 inputs NOTE Margins not provided
52:58.892 inputs NOTE Cells subset values file not provided
52:58.903 outputs PROGRESS Loaded table sample


In [21]:
# Show the sweep coordinate value attached to the JointTableSIM_MCMC residuals
sit_mcmc_residual.sweep.values

array((0.14142000675201416, "['alpha', 'beta']", '[[1.0, 0.0], [0.0, 1.0]]', '_row_constrained', '[[1]]', 'None', 'ProductionConstrained', 'direct_sampling'),
      dtype=object)

## All three methods

In [57]:
# Extremes of the SIM MCMC flow residuals
sim_mcmc_residual_values = sim_mcmc_residual.values
print(sim_mcmc_residual_values.min(),sim_mcmc_residual_values.max())

# Headline metrics: SRMSE of the mean intensity against the ground truth, and
# the coverage indicator averaged over all origin-destination cells
sim_mcmc_srmse = srmse(sim_mcmc_intensity_mean,ground_truth_table).values[0]
sim_mcmc_mean_cp = sim_mcmc_intensity_cp.mean(['origin','destination'],dtype='float64').values[0]
print("SIM MCMC \n", "SRMSE", sim_mcmc_srmse, "CP", sim_mcmc_mean_cp)

-119.0084510760498 290.6952456342316
SIM MCMC 
 SRMSE 0.6195762415103968 CP 0.31438127090301005


In [58]:
# Extremes of the Joint Table SIM MCMC flow residuals
sit_mcmc_residual_values = sit_mcmc_residual.values
print(sit_mcmc_residual_values.min(),sit_mcmc_residual_values.max())

# Headline metrics: SRMSE of the mean table against the ground truth, and
# the coverage indicator averaged over all origin-destination cells
sit_mcmc_srmse = srmse(sit_mcmc_table_mean,ground_truth_table).values[0]
sit_mcmc_mean_cp = sit_mcmc_table_cp.mean(['origin','destination'],dtype='float64').values[0]
print("Joint Table SIM MCMC \n", "SRMSE", sit_mcmc_srmse, "CP", sit_mcmc_mean_cp)

-122.04723999999999 291.41227000000003
Joint Table SIM MCMC 
 SRMSE 0.615262826311941 CP 0.5797101449275363


In [59]:
# Extremes of the SIM NN flow residuals
sim_nn_residual_values = sim_nn_residual.values
print(sim_nn_residual_values.min(),sim_nn_residual_values.max())

# Headline metrics: SRMSE of the mean intensity against the ground truth, and
# the coverage indicator averaged over all origin-destination cells
sim_nn_srmse = srmse(simnn_intensity_mean,ground_truth_table).values[0]
sim_nn_mean_cp = simnn_intensity_cp.mean(['origin','destination'],dtype='float64').values[0]
print("SIM NN \n", "SRMSE", sim_nn_srmse, "CP", sim_nn_mean_cp)

-75.01730598461151 144.96222004364012
SIM NN 
 SRMSE 0.415653454189442 CP 0.9208472686733556


In [25]:
# Colour value written for the zero-valued padding cells (row/column 0) of the SIM_NN cell-data export.
# NOTE(review): hard-coded; it appears to approximate where a residual of 0 falls on the
# flow colour scale, (0 - FLOW_MIN_VAL) / (FLOW_MAX_VAL - FLOW_MIN_VAL), which is about 0.295
# for the bounds printed in this notebook. Confirm, and update it if those bounds change.
ZERO_COLOR = 0.2954

In [46]:
# Colour-scale bounds shared by all three methods: the smallest and largest
# flow residual found in any of them.
flow_residuals = (sim_mcmc_residual, sim_nn_residual, sit_mcmc_residual)
FLOW_MIN_VAL = min(residual.values.min() for residual in flow_residuals)
FLOW_MAX_VAL = max(residual.values.max() for residual in flow_residuals)
print(FLOW_MIN_VAL,FLOW_MAX_VAL)

-122.04723999999999 291.41227000000003


In [47]:
# Residuals summed over origins for each method, computed once and reused for both bounds
demand_residual_values = [
    residual.sum('origin',dtype='float64').values
    for residual in (sim_mcmc_residual, sim_nn_residual, sit_mcmc_residual)
]
# Colour-scale bounds shared by all three methods' destination-demand residuals
DEMAND_MIN_VAL = min(demand.min() for demand in demand_residual_values)
DEMAND_MAX_VAL = max(demand.max() for demand in demand_residual_values)
print(DEMAND_MIN_VAL,DEMAND_MAX_VAL)

-961.2746893793677 1217.6436098052407


In [28]:
def normalise_to_flow_colorscale(value):
    """Rescale a flow residual linearly so FLOW_MIN_VAL maps to 0 and FLOW_MAX_VAL maps to 1."""
    offset_from_min = value - FLOW_MIN_VAL
    flow_span = FLOW_MAX_VAL - FLOW_MIN_VAL
    return offset_from_min / flow_span


def normalise_to_demand_colorscale(value):
    """Rescale a demand residual linearly so DEMAND_MIN_VAL maps to 0 and DEMAND_MAX_VAL maps to 1."""
    offset_from_min = value - DEMAND_MIN_VAL
    demand_span = DEMAND_MAX_VAL - DEMAND_MIN_VAL
    return offset_from_min / demand_span

def write_mixed_numpy_to_dat(filename: str, array: np.ndarray, header: list[str] = None, delimiter: str = '\t', dtypes: list = None) -> None:
    """
    Save a 2D NumPy array to a delimited text (.dat) file, converting each column.

    Parameters
    ----------
    filename : str
        Path to the .dat file. It is opened in write mode, so an existing
        file is overwritten.
    array : np.ndarray
        A 2D NumPy array (any dtype, including object).
    header : list of str, optional
        Column headers written as the first line (default: None, no header line).
    delimiter : str
        Delimiter between fields (default: tab).
    dtypes : list of callables, optional
        One converter per column (e.g. ``[int, int, float, float]``). Each cell
        is passed through its column's converter before being written as text.
        Converters beyond the number of columns are ignored. When omitted or
        empty, cells are written with ``str`` and no conversion.

    Raises
    ------
    ValueError
        If the array is not 2D, the header length does not match the number of
        columns, or fewer converters than columns are supplied.
    """
    # Imported locally so the error handler below does not depend on `inspect`
    # having been brought into the notebook namespace by another cell.
    import inspect

    try:
        if array.ndim != 2:
            raise ValueError("Only 2D arrays are supported.")
        n_columns = array.shape[1]

        # Validate everything before opening the file, so invalid arguments
        # never truncate an existing file.
        if header and len(header) != n_columns:
            raise ValueError("Header length does not match number of columns.")
        if dtypes is None or len(dtypes) == 0:
            converters = None
        elif len(dtypes) < n_columns:
            raise ValueError("Fewer dtypes than columns were provided.")
        else:
            converters = list(dtypes)[:n_columns]

        with open(filename, 'w', encoding='utf-8') as f:
            if header:
                f.write(delimiter.join(header) + '\n')

            for row in array:
                if converters is None:
                    cells = row
                else:
                    cells = (convert(cell) for convert, cell in zip(converters, row))
                f.write(delimiter.join(str(cell) for cell in cells) + '\n')

    except Exception:
        # Log where the failure was handled, with the full traceback, then re-raise
        frame = inspect.currentframe()
        info = inspect.getframeinfo(frame)
        logger.error(
            f"Failed to write .dat file '{filename}' at {info.filename}:{info.lineno} in {info.function}()",
            exc_info=True
        )
        raise

In [29]:
sim_nn_filename = "intensity_mean_heatmap_NeuralABM_low_noise_row_margin_thinning1_burnin10000"

# Flow residuals on a grid padded with an extra row 0 and column 0:
# padding cells get z = 0 and ZERO_COLOR, and cell (i, j) holds residual (i-1, j-1).
sim_nn_residual_arr = []
for i, j in np.ndindex(sim_nn_residual.sizes['origin']+1, sim_nn_residual.sizes['destination']+1):
    if i != 0 and j != 0:
        flow = sim_nn_residual.values[i-1,j-1]
        sim_nn_residual_arr.append([i,j,flow,normalise_to_flow_colorscale(flow)])
    else:
        sim_nn_residual_arr.append([i,j,0,ZERO_COLOR])
write_mixed_numpy_to_dat(
    os.path.join(output_path,sim_nn_filename+"_cell_data.dat"),
    np.array(sim_nn_residual_arr),
    header=["x","y","z","color"],
    dtypes=[int,int,float,float]
)

# Destination-demand residuals, written twice: once at x = 69 and once at x = 70 (hard-coded)
sim_nn_residual_demand_arr = [
    [
        x,
        j,
        sim_nn_residual_demand.values[j],
        normalise_to_demand_colorscale(sim_nn_residual_demand.values[j])
    ]
    for x in (69, 70)
    for j in range(sim_nn_residual_demand.sizes['destination'])
]
write_mixed_numpy_to_dat(
    os.path.join(output_path,sim_nn_filename+"_destination_demand_cell_data.dat"),
    np.array(sim_nn_residual_demand_arr),
    header=["x","y","z","color"],
    dtypes=[int,int,float,float]
)

# Coordinates of the cells whose coverage indicator is truthy
sim_nn_cps = []
for i, j in np.ndindex(simnn_intensity_cp.sizes['origin'], simnn_intensity_cp.sizes['destination']):
    if simnn_intensity_cp.values[i,j]:
        sim_nn_cps.append([i,j])
write_mixed_numpy_to_dat(
    os.path.join(output_path,sim_nn_filename+"_covered_cell_coordinates.dat"),
    np.array(sim_nn_cps),
    header=["x","y"],
    dtypes=[int,int]
)

In [30]:
sim_mcmc_filename = "intensity_mean_heatmap_SIMLatentMCMC_high_noise_row_margin_thinning1_burnin10000"
# Options shared by every np.savetxt call in this cell
sim_mcmc_savetxt_options = dict(comments='', fmt="%.6f")

# One row per (origin, destination) cell: indices, flow residual, colour-scale position
sim_mcmc_residual_arr = []
for i, j in np.ndindex(sim_mcmc_residual.sizes['origin'], sim_mcmc_residual.sizes['destination']):
    flow = sim_mcmc_residual.values[i,j]
    sim_mcmc_residual_arr.append([i,j,flow,normalise_to_flow_colorscale(flow)])
np.savetxt(
    os.path.join(output_path,sim_mcmc_filename+"_cell_data.txt"),
    np.array(sim_mcmc_residual_arr),
    header="x y z color",
    **sim_mcmc_savetxt_options
)

# Destination-demand residuals, written twice: once at x = 69 and once at x = 70 (hard-coded)
sim_mcmc_residual_demand_arr = [
    [
        x,
        j,
        sim_mcmc_residual_demand.values[j],
        normalise_to_demand_colorscale(sim_mcmc_residual_demand.values[j])
    ]
    for x in (69, 70)
    for j in range(sim_mcmc_residual_demand.sizes['destination'])
]
np.savetxt(
    os.path.join(output_path,sim_mcmc_filename+"_destination_demand_cell_data.txt"),
    np.array(sim_mcmc_residual_demand_arr),
    header="x y z color",
    **sim_mcmc_savetxt_options
)

# Coordinates of the cells whose coverage indicator is truthy
sim_mcmc_cps = []
for i, j in np.ndindex(sim_mcmc_intensity_cp.sizes['origin'], sim_mcmc_intensity_cp.sizes['destination']):
    if sim_mcmc_intensity_cp.values[i,j]:
        sim_mcmc_cps.append([i,j])
np.savetxt(
    os.path.join(output_path,sim_mcmc_filename+"_covered_cell_coordinates.txt"),
    np.array(sim_mcmc_cps),
    header="x y",
    **sim_mcmc_savetxt_options
)

In [31]:
sit_mcmc_filename = "table_mean_heatmap_JointTableSIMLatentMCMC_high_noise_row_margin_thinning1_burnin10000"
# Options shared by every np.savetxt call in this cell
sit_mcmc_savetxt_options = dict(comments='', fmt="%.6f")

# One row per (origin, destination) cell: indices, flow residual, colour-scale position
sit_mcmc_residual_arr = []
for i, j in np.ndindex(sit_mcmc_residual.sizes['origin'], sit_mcmc_residual.sizes['destination']):
    flow = sit_mcmc_residual.values[i,j]
    sit_mcmc_residual_arr.append([i,j,flow,normalise_to_flow_colorscale(flow)])
np.savetxt(
    os.path.join(output_path,sit_mcmc_filename+"_cell_data.txt"),
    np.array(sit_mcmc_residual_arr),
    header="x y z color",
    **sit_mcmc_savetxt_options
)

# Destination-demand residuals, written twice: once at x = 69 and once at x = 70 (hard-coded)
sit_mcmc_residual_demand_arr = [
    [
        x,
        j,
        sit_mcmc_residual_demand.values[j],
        normalise_to_demand_colorscale(sit_mcmc_residual_demand.values[j])
    ]
    for x in (69, 70)
    for j in range(sit_mcmc_residual_demand.sizes['destination'])
]
np.savetxt(
    os.path.join(output_path,sit_mcmc_filename+"_destination_demand_cell_data.txt"),
    np.array(sit_mcmc_residual_demand_arr),
    header="x y z color",
    **sit_mcmc_savetxt_options
)

# Coordinates of the cells whose coverage indicator is truthy
sit_mcmc_cps = []
for i, j in np.ndindex(sit_mcmc_table_cp.sizes['origin'], sit_mcmc_table_cp.sizes['destination']):
    if sit_mcmc_table_cp.values[i,j]:
        sit_mcmc_cps.append([i,j])
np.savetxt(
    os.path.join(output_path,sit_mcmc_filename+"_covered_cell_coordinates.txt"),
    np.array(sit_mcmc_cps),
    header="x y",
    **sit_mcmc_savetxt_options
)