In [1]:
import os
import glob
import warnings
import h5py as h5
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from copy import deepcopy
from tqdm.auto import tqdm
from IPython.core.display import display, HTML

from gensit.config import Config
from gensit.inputs import Inputs
from gensit.outputs import Outputs
from gensit.utils.misc_utils import *
from gensit.utils.math_utils import *

from gensit.utils.probability_utils import *
from gensit.contingency_table import instantiate_ct
from gensit.contingency_table.contingency_table_mcmc import ContingencyTableMarkovChainMonteCarlo

  from IPython.core.display import display, HTML


In [2]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Auto-reload external modules: load the autoreload extension and use mode 2,
# so edits to imported source files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

## Import samples

In [12]:
# --- Experiment selection -------------------------------------------------
# Dataset whose outputs are analysed.
# Other values kept for reference:
#   'cambridge_work_commuter_lsoas_to_msoas'
#   'sioux_falls'
dataset = 'DC'
# Experiment group inside the dataset folder.
# Other values kept for reference: 'r_squared', 'exp1'
experiment_group_id = 'comparisons'
# Identifier of the experiment (last component of the output folder path)
experiment_id = "GBRT_Comparison_UnsetNoise__doubly_and_cell_constrained_25_04_2024_18_41_52"

# Output folder of the experiment and the same folder expressed relative to
# the current working directory
experiment_dir = f'../data/outputs/{dataset}/{experiment_group_id}/{experiment_id}/'
relative_experiment_dir = os.path.relpath(experiment_dir, start=os.getcwd())

# Create new logging object for this notebook
logger = setup_logger(__name__, console_level='PROGRESS', file_level='EMPTY')

In [13]:
# Output processing settings (handed to Outputs via its `settings` argument).
#
# Example "coordinate_slice" expressions kept for reference (none is active):
#   "da.loss_name.isin([str(['dest_attraction_ts_likelihood_loss']),str(['dest_attraction_ts_likelihood_loss', 'table_likelihood_loss']),str(['table_likelihood_loss'])])"
#   "da.loss_name == str(['dest_attraction_ts_likelihood_loss'])"
#   "da.cost_matrix == 'cost_matrix_max_normalised.txt'"
#   "da.destination_attraction_ts == 'destination_attraction_housing_units_ts_sum_normalised.txt'"
#   "da.cost_matrix == 'cost_matrix_sum_normalised.txt'"
#   "da.title == '_row_constrained'"
#   "da.bmax == 1.0"
#   "~da.title.isin([str('_unconstrained'), str('_total_constrained')])"
#
# Example "burnin_thinning_trimming" entry kept for reference (none is active):
#   {'iter': {"burnin":10000, "thinning":90, "trimming":1000}}
settings = dict(
    logging_mode = "PROGRESS",
    coordinate_slice = [],
    metadata_keys = [],
    burnin_thinning_trimming = [],
    n_workers = 1,
    group_by = [],
    filename_ending = "test",
    sample = ["intensity"],
    force_reload = False,
)

In [14]:
# Build the outputs handler from the experiment's config.json and the
# processing settings defined above
outputs_config_path = os.path.join(relative_experiment_dir, "config.json")
current_sweep_outputs = Outputs(
    config=outputs_config_path,
    settings=settings,
    inputs=None,
    slice=True,
    level='INFO',
)
# To silence the outputs logger, uncomment:
# current_sweep_outputs.logger.setLevels(console_level='EMPTY')

# Load all data and report how many experiments were loaded
current_sweep_outputs.load()
print(f"{len(current_sweep_outputs.data)} experiments matched")

1 experiments matched


In [None]:
# conf = Config(
#     path = os.path.join(relative_experiment_dir,"config.json")
# )
# ins = Inputs(
#     config = conf
# )
# ins.cast_to_xarray()

# $R^2$ analysis

In [None]:
# Keep only the sweeps whose maximum r2 value is above 0.6
n_sweeps = len(current_sweep_outputs.data)
sweep_outputs_slices = []
for sweep_index in tqdm(range(n_sweeps), leave=False, desc='Finding best R2 experiments'):
    candidate = current_sweep_outputs.get(sweep_index)
    if not (np.max(candidate.data.r2) > 0.6):
        continue
    sweep_outputs_slices.append(candidate)
print(f"{len(sweep_outputs_slices)} experiments kept")

In [None]:
# index = 15
# sweep_outputs_slice = sweep_outputs_slices[index]

In [None]:
# data_index = 0
# sweep_outputs_slice = current_sweep_outputs.get(data_index)

In [None]:
# Select the sweep whose r2 values are analysed. The selection is made here
# explicitly so the cell runs on a fresh kernel; change `data_index` to inspect
# a different sweep.
data_index = 0
sweep_outputs_slice = current_sweep_outputs.get(data_index)
r2 = sweep_outputs_slice.data.r2

# Grid definition (min / max / n) of the alpha and beta parameters, read from
# the first experiment of the sweep configuration
grid_ranges = current_sweep_outputs.config['experiments'][0]['grid_ranges']

# Attach the actual alpha grid values to r2 under the name 'alpha_range'
alpha_range = grid_ranges['alpha']
r2['alpha_range'] = np.linspace(alpha_range['min'],alpha_range['max'],alpha_range['n'],endpoint=True)
# NOTE(review): re-assigning the values just written looks redundant; kept to
# preserve the original behaviour — confirm before removing
r2['alpha_range'] = r2['alpha_range'].values

# Same for the beta grid
beta_range = grid_ranges['beta']
r2['beta_range'] = np.linspace(beta_range['min'],beta_range['max'],beta_range['n'],endpoint=True)
r2['beta_range'] = r2['beta_range'].values

# Display the resulting coordinates
r2.coords

In [None]:
# r2 values as a plain array with singleton dimensions removed. Computed once
# and reused for the argmax, the image and the per-cell annotations, so all
# three see the same array.
# NOTE(review): assumes rows index alpha and columns index beta — confirm
r2_grid = r2.values.squeeze()
alpha_ticks = np.round(r2['alpha_range'].values,2)
beta_ticks = np.round(r2['beta_range'].values,2)

# (row, column) position of the largest r2 value
argmax_index = np.unravel_index(np.argmax(r2_grid), np.shape(r2_grid))

plt.figure(figsize=(20,20))
plt.imshow(r2_grid, cmap='RdYlGn', interpolation='nearest')
# Mark the best cell; scatter takes (x, y) = (column, row)
plt.scatter(argmax_index[1],argmax_index[0],marker='x',color='black',s=500)
plt.yticks(ticks=range(len(alpha_ticks)),labels=alpha_ticks)
plt.ylabel('alpha')
plt.xticks(ticks=range(len(beta_ticks)),labels=beta_ticks)
plt.xlabel('beta')
# Annotate every cell with its rounded r2 value
for i in range(len(alpha_ticks)):
    for j in range(len(beta_ticks)):
        plt.text(j,i,s=np.round(r2_grid[i,j],2),fontsize=8)
plt.show()

# SIM Analysis

In [16]:
# Pick the sweep to analyse and show the coordinates of its intensity sample
index = 0
current_data = current_sweep_outputs.get(index)
print(f"# Sweeps: {len(current_sweep_outputs.data)}")
intensity_coords = current_data.data.intensity.coords
print(intensity_coords.items())

# Sweeps: 1
ItemsView(Coordinates:
  * origin       (origin) int16 1 2 3 4 5 6 7 8 ... 173 174 175 176 177 178 179
  * destination  (destination) int16 1 2 3 4 5 6 7 ... 174 175 176 177 178 179
  * id           (id) object MultiIndex
  * iter         (id) int32 1)


In [17]:
# Load the inputs that belong to the selected sweep's configuration
ins = Inputs(
    config = current_data.config
)
ins.cast_to_xarray()

# Test / train cell files of the selected dataset. The folder follows the
# `dataset` variable so that switching dataset cannot silently reuse the
# cells of another one.
cells_dir = f'../data/inputs/{dataset}'
test_cells = read_file(f'{cells_dir}/test_cells.txt').astype('int32')
train_cells = read_file(f'{cells_dir}/train_cells.txt').astype('int32')

In [21]:
# Mean of the table samples over the 'id' dimension, computed once and shared
# by the three srmse calls below (previously recomputed for each call)
table_mean = current_data.data.table.mean('id',dtype='float64')

# srmse of the mean table against the ground truth table over all cells ...
all_table_error = srmse(
    prediction = table_mean,
    ground_truth = ins.data.ground_truth_table
)
# ... restricted to the training cells ...
train_table_error = srmse(
    prediction = table_mean,
    ground_truth = ins.data.ground_truth_table,
    cells = train_cells
)
# ... and restricted to the test cells
test_table_error = srmse(
    prediction = table_mean,
    ground_truth = ins.data.ground_truth_table,
    cells = test_cells
)

In [None]:
# Table srmse as plain scalars, in the order: all cells, train cells, test cells
table_errors = (all_table_error, train_table_error, test_table_error)
print(*(error.values.squeeze().item() for error in table_errors))

In [22]:
# Mean of the intensity samples over the 'id' dimension, computed once and
# shared by the three srmse calls below (previously the sample was fetched and
# averaged separately for each call)
intensity_mean = current_data.get_sample('intensity').mean('id',dtype='float64')

# srmse of the mean intensity against the ground truth table over all cells ...
all_intensity_error = srmse(
    prediction = intensity_mean,
    ground_truth = ins.data.ground_truth_table
)
# ... restricted to the training cells ...
train_intensity_error = srmse(
    prediction = intensity_mean,
    ground_truth = ins.data.ground_truth_table,
    cells = train_cells
)
# ... and restricted to the test cells
test_intensity_error = srmse(
    prediction = intensity_mean,
    ground_truth = ins.data.ground_truth_table,
    cells = test_cells
)

In [23]:
# Intensity srmse as plain scalars, in the order: all cells, train cells, test cells
intensity_errors = (all_intensity_error, train_intensity_error, test_intensity_error)
print(*(error.values.squeeze().item() for error in intensity_errors))

2.667843536404041 0.0 2.297288107211039


In [None]:
# Arguments shared by the three coverage probability computations on the table samples
table_cp_kwargs = dict(
    prediction=current_data.data.table,
    ground_truth=ins.data.ground_truth_table,
    region_mass=0.95,
)
# All cells, then train cells only, then test cells only
all_table_cp = coverage_probability(**table_cp_kwargs)
train_table_cp = coverage_probability(cells=train_cells, **table_cp_kwargs)
test_table_cp = coverage_probability(cells=test_cells, **table_cp_kwargs)


In [None]:
# Short aliases for the table coverage probabilities.
# The train alias was previously written to `test_cp` and then overwritten,
# leaving `train_cp` undefined.
all_cp = all_table_cp
train_cp = train_table_cp
test_cp = test_table_cp

In [None]:
# Table coverage probability averaged over origin and destination (NaNs skipped),
# in the order: all cells, train cells, test cells
table_cps = (all_table_cp, train_table_cp, test_table_cp)
print(*(cp.mean(['origin','destination'],skipna=True).values.item() for cp in table_cps))

In [None]:
# Intensity samples, fetched once and shared by the three coverage probability
# computations below (previously fetched separately for each call)
intensity_samples = current_data.get_sample('intensity')

# Coverage probability with region_mass 0.95 over all cells ...
all_intensity_cp = coverage_probability(
    prediction = intensity_samples,
    ground_truth = ins.data.ground_truth_table,
    region_mass = 0.95
)
# ... restricted to the training cells ...
train_intensity_cp = coverage_probability(
    prediction = intensity_samples,
    ground_truth = ins.data.ground_truth_table,
    region_mass = 0.95,
    cells = train_cells
)
# ... and restricted to the test cells
test_intensity_cp = coverage_probability(
    prediction = intensity_samples,
    ground_truth = ins.data.ground_truth_table,
    region_mass = 0.95,
    cells = test_cells
)

In [None]:
# Intensity coverage probability averaged over origin and destination (NaNs skipped),
# in the order: all cells, train cells, test cells
intensity_cps = (all_intensity_cp, train_intensity_cp, test_intensity_cp)
print(*(cp.mean(['origin','destination'],skipna=True).values.item() for cp in intensity_cps))

In [None]:
# Exponentiated log destination attraction samples, averaged over 'id'
predicted_attraction = np.exp(current_data.data.log_destination_attraction).mean('id').values.squeeze()
# Destination attraction values from the inputs
observed_attraction = ins.data.destination_attraction_ts.squeeze()

# Predictions on the x axis against data on the y axis
_ = plt.scatter(predicted_attraction, observed_attraction)
plt.xlabel("Predictions")
plt.ylabel("Data")

In [None]:
# Histogram of the alpha samples (30 bins)
alpha_samples = current_data.data.alpha.squeeze().values
_ = plt.hist(alpha_samples, bins=30)
plt.xlabel('alpha')

In [None]:
# Histogram of the beta samples (30 bins)
beta_samples = current_data.data.beta.squeeze().values
_ = plt.hist(beta_samples, bins=30)
plt.xlabel('beta')

In [None]:
# Joint 2D histogram of the samples: beta on the x axis, alpha on the y axis
beta_values = current_data.data.beta.squeeze().values
alpha_values = current_data.data.alpha.squeeze().values
_ = plt.hist2d(beta_values, alpha_values, bins=30)
plt.ylabel('alpha')
plt.xlabel('beta')