In [1]:
import h5py as h5
import numpy as np
import pandas as pd
from tqdm import tqdm
from copy import deepcopy


from multiresticodm.config import Config
from multiresticodm.inputs import Inputs
from multiresticodm.outputs import Outputs
from multiresticodm.utils.misc_utils import *
from multiresticodm.utils.math_utils import *
from multiresticodm.utils.probability_utils import *
from multiresticodm.contingency_table import instantiate_ct
from multiresticodm.markov_basis import instantiate_markov_basis

In [2]:
# Render matplotlib figures inline, directly in the notebook output
%matplotlib inline

# Load the autoreload extension and reload imported modules before each cell
# is executed, so edits to the project source are picked up without
# restarting the kernel
%load_ext autoreload
%autoreload 2

In [3]:
def validate_tables(out):
    """Print admissibility diagnostics for the table samples held by ``out``.

    A contingency table object is instantiated from ``out.config`` and the
    data variables of ``out.inputs``. Every table of the ``'table'`` sample
    (one per ``id`` group) is then checked against that object.

    Parameters
    ----------
    out
        Object exposing ``config``, ``logger``, ``inputs`` and
        ``get_sample``. When ``out.inputs`` is ``None`` a new ``Inputs`` is
        created here and stored back on ``out``.

    Returns
    -------
    None
        All results are reported through ``print``.
    """
    if out.inputs is None:
        out.inputs = Inputs(
            config = out.config,
            synthetic_data = False,
            logger = out.logger
        )
    else:
        try:
            out.inputs.cast_from_xarray()
        except Exception as exc:
            # The cast stays best effort, but the failure is reported instead
            # of silently discarded. Catching Exception (not a bare except)
            # lets KeyboardInterrupt/SystemExit propagate.
            print(f'cast_from_xarray failed, continuing with inputs as-is: {exc!r}')

    ct = instantiate_ct(
        config = out.config,
        **out.inputs.data_vars(),
        level = 'EMPTY'
    )
    samples = out.get_sample('table')
    print(dict(samples.sizes))
    print('axes constraints',ct.constraints['constrained_axes'])
    print('cell constraints',len(ct.constraints['cells']))

    # Convert every sampled table to a tensor once and reuse it for all checks
    tables = [torch.tensor(tab.values.squeeze()) for _,tab in samples.groupby('id')]

    tables_admissible = all(ct.table_admissible(tab) for tab in tables)
    print('Tables admissible',tables_admissible)
    if not tables_admissible:
        # Both breakdowns use `all`, matching the overall check above, so a
        # False value pinpoints which kind of constraint is violated.
        print('Tables margins admissible',all(ct.table_margins_admissible(tab) for tab in tables))
        print('Tables cells admissible',all(ct.table_cells_admissible(tab) for tab in tables))

In [4]:
# Locate the experiment whose outputs are analysed below.
# Other experiment ids used previously:
#   'NonJointTableSIM_NN_SweepedNoise_26_01_2024_13_26_18'
#   'JointTableSIM_NN_SweepedNoise_23_01_2024_21_33_25'
experiment_id = 'NonJointTableSIM_NN_SweepedNoise_30_01_2024_23_25_12'
experiment_dir = '../data/outputs/cambridge_work_commuter_lsoas_to_msoas/exp1/' + experiment_id + '/'
# Same location, expressed relative to the current working directory
relative_experiment_dir = os.path.relpath(experiment_dir, start=os.getcwd())

In [55]:
# Options passed to Outputs when processing stored experiment data
settings = dict(
    logging_mode="INFO",
    # Expressions used to slice the output coordinates
    coordinate_slice=[
        "da.loss_name == str(['dest_attraction_ts_likelihood_loss'])",
        "~da.title.isin([str('_unconstrained'), str('_total_constrained')])",
    ],
    metadata_keys=[],
    # Example of a non-empty value:
    # {'iter': {"burnin":100, "thinning":90, "trimming":10}}
    burnin_thinning_trimming=[],
    n_workers=1,
    group_by=["seed"],
    filename_ending="test",
    sample=["table"],
    force_reload=False,
)

In [14]:
# Sweep folder (loss name and seed) shared by every run inspected below
sweep_id = 'loss_name_[dest_attraction_ts_likelihood_loss]/seed_23/'
sigmas = ['low','high','learned']
titles = ['_doubly_constrained','_doubly_10%_cell_constrained','_doubly_20%_cell_constrained']

# The bar is not attached to an iterable, so it has to be advanced by hand
# after each run; the context manager closes it on exit, also on error.
with tqdm(total = len(sigmas)*len(titles), desc = 'Loading sweep data') as progress:
    for sig in sigmas:
        for titl in titles:
            # Relative location of this (sigma, title) run inside the sweep
            run_subdir = f"sigma_{sig}/title_{titl}/"
            # Create current sweep id
            current_sweep_id = os.path.join('samples',sweep_id,run_subdir)

            print(run_subdir)
            # Initialise outputs
            current_sweep_outputs = Outputs(
                config = os.path.join(relative_experiment_dir,current_sweep_id),
                settings = settings,
                inputs = None,
                slice = True,
                level = 'INFO'
            )
            # Silence outputs
            current_sweep_outputs.logger.setLevels(console_level='EMPTY')
            # Load all data
            current_sweep_outputs.load()
            # Get first collection id
            current_sweep_outputs0 = current_sweep_outputs.get(0)
            # Optional admissibility check of the sampled tables
            # validate_tables(current_sweep_outputs0)
            # NOTE(review): the ground truth is read from the parent outputs
            # object while the samples come from the first collection —
            # confirm both refer to the same inputs.
            print('SRMSE',srmse(current_sweep_outputs0.data.table.mean('id'),current_sweep_outputs.inputs.data.ground_truth_table).values.squeeze())
            print('\n')
            # Record that this run has been processed
            progress.update(1)

Loading sweep data:   0%|          | 0/9 [01:14<?, ?it/s]


sigma_low/title__doubly_constrained/
{'disable_tqdm': True, 'hyperparameters': {'num_layers': 1, 'optimizer': 'Adam', 'learning_rate': 0.002, 'biases': {'default': [0.0, 4.0], 'layer_specific': {}}, 'nodes_per_layer': {'default': 20, 'layer_specific': {}}, 'activation_funcs': {'default': 'linear', 'layer_specific': {'1': 'abs'}}}}
sweep
{'sigma': 0.014139999635517597, 'to_learn': ['alpha', 'beta'], 'covariance': [[0.0149674, 0.00182529], [0.00182529, 0.0109968]], 'title': '_doubly_constrained', 'axes': [[0], [1]], 'cells': 'cells.txt', 'name': 'TotallyConstrained', 'proposal': 'degree_higher', 'loss_name': ['dest_attraction_ts_likelihood_loss'], 'loss_function': ['custom'], 'loss_kwargs': {'noise_percentage': None}}
{'disable_tqdm': True, 'hyperparameters': {'num_layers': 1, 'optimizer': 'Adam', 'learning_rate': 0.002, 'biases': {'default': [0.0, 4.0], 'layer_specific': {}}, 'nodes_per_layer': {'default': 20, 'layer_specific': {}}, 'activation_funcs': {'default': 'linear', 'layer_speci

In [56]:
# Build an Outputs object rooted at the experiment directory
current_sweep_outputs = Outputs(config=relative_experiment_dir, settings=settings, inputs=None, slice=True, level='INFO')
# Mute console logging
current_sweep_outputs.logger.setLevels(console_level='EMPTY')
# Read the stored data
current_sweep_outputs.load()
# Keep the collection at index 0
current_sweep_outputs0 = current_sweep_outputs.get(0)
# Optional admissibility check of the sampled tables
# validate_tables(current_sweep_outputs0)

44:02.956 config INFO ----------------------------------------------------------------------------------
44:02.970 config INFO Parameter space size: 
 --- seed (100)
 --- sigma: ['sigma', 'to_learn', 'covariance'] (3)
 --- title: ['title', 'axes', 'cells', 'name', 'proposal'] (3)
 --- loss_name: ['loss_name', 'loss_function', 'loss_kwargs'] (2)
44:02.984 config INFO Total = 1800.
44:02.998 config INFO ----------------------------------------------------------------------------------
44:03.047 outputs INFO //////////////////////////////////////////////////////////////////////////////////
44:03.061 outputs INFO Slicing coordinates:
44:03.075 outputs INFO loss_name == str(['dest_attraction_ts_likelihood_loss'])
44:03.088 outputs INFO ~title.isin([str('_unconstrained'), str('_total_constrained')])
44:03.102 outputs INFO //////////////////////////////////////////////////////////////////////////////////
                                                                                 

{'disable_tqdm': True, 'loss': {'loss_name': {'sweep': {'default': ['dest_attraction_ts_loss'], 'range': [['dest_attraction_ts_loss'], ['dest_attraction_ts_likelihood_loss']]}}, 'loss_function': {'sweep': {'default': ['mseloss'], 'range': [['mseloss'], ['custom']], 'coupled': True, 'target_name': 'loss_name'}}, 'loss_kwargs': {'sweep': {'coupled': True, 'target_name': 'loss_name', 'default': {'nokey': nan}, 'range': [{'nokey': nan}, {'noise_percentage': nan}]}}}, 'hyperparameters': {'num_layers': 1, 'optimizer': 'Adam', 'learning_rate': 0.002, 'biases': {'default': [0.0, 4.0], 'layer_specific': {}}, 'nodes_per_layer': {'default': 20, 'layer_specific': {}}, 'activation_funcs': {'default': 'linear', 'layer_specific': {'1': 'abs'}}}}
sweep
{'sigma': 0.014139999635517597, 'to_learn': ['alpha', 'beta'], 'covariance': [[0.0149674, 0.00182529], [0.00182529, 0.0109968]], 'title': '_doubly_10%_cell_constrained', 'axes': [[0], [1]], 'cells': 'constraints/cell_constraints_permuted_size_90_cell_pe

IndexError: list index out of range

In [64]:
# Fetch the collection at index 0 from the outputs object again
current_sweep_outputs0 = current_sweep_outputs.get(0)

In [66]:
# Display the settings stored on the config of the first collection
current_sweep_outputs0.config.settings

{'log_level': 'info',
 'sweep_mode': True,
 'inputs': {'n_workers': 4,
  'n_threads': 6,
  'device': 'cpu',
  'in_directory': './data/inputs/',
  'dataset': 'cambridge_work_commuter_lsoas_to_msoas',
  'load_experiment': '',
  'seed': {'sweep': {'default': 1, 'range': ['0:99:1']}},
  'to_learn': ['alpha', 'beta'],
  'data': {'origin_demand': {'file': 'origin_demand_sum_normalised.txt'},
   'destination_attraction_ts': {'file': 'destination_attraction_time_series_sum_normalised.txt'},
   'cost_matrix': {'file': 'cost_matrices/clustered_facilities_sample_20x20_20_01_2023_sample_20x20_clustered_facilities_ripleys_k_500_euclidean_points%_prob_origin_destination_adjusted_normalised_boundary_only_edge_corrected_cost_matrix_max_normalised.txt'},
   'ground_truth_table': {'file': 'table_lsoas_to_msoas.txt'},
   'total_cost_by_origin': {'file': 'lsoas_total_distance_to_work.txt'}}},
 'contingency_table': {'sparse_margins': True,
  'constraints': {'axes': [[0], [1]],
   'cells': 'constraints/cell

In [23]:
# One hand-written sweep point, copied from the values printed while loading
sweep = {
    'sigma': 0.014139999635517597,
    'to_learn': ['alpha', 'beta'],
    'covariance': [[0.0149674, 0.00182529], [0.00182529, 0.0109968]],
    'title': '_doubly_constrained',
    'axes': [[0], [1]],
    'cells': 'cells.txt',
    'name': 'TotallyConstrained',
    'proposal': 'degree_higher',
    'loss_name': ['dest_attraction_ts_likelihood_loss'],
    'loss_function': ['custom'],
    'loss_kwargs': {'noise_percentage': None},
}
# Convert the sweep point with the loaded config
sweep_configuration = current_sweep_outputs.config.convert_sweep(sweep)

In [19]:
# Apply the hand-written sweep values to the loaded config.
# NOTE(review): presumably modifies the config in place, so every later cell
# sees the updated config — confirm.
current_sweep_outputs.config.update(sweep)

In [24]:
# Display the 'neural_network' section of the loaded config
current_sweep_outputs.config['neural_network']

{'disable_tqdm': True,
 'loss': {'loss_name': {'sweep': {'default': ['dest_attraction_ts_loss'],
    'range': [['dest_attraction_ts_loss'],
     ['dest_attraction_ts_likelihood_loss']]}},
  'loss_function': {'sweep': {'default': ['mseloss'],
    'range': [['mseloss'], ['custom']],
    'coupled': True,
    'target_name': 'loss_name'}},
  'loss_kwargs': {'sweep': {'coupled': True,
    'target_name': 'loss_name',
    'default': {'nokey': nan},
    'range': [{'nokey': nan}, {'noise_percentage': nan}]}}},
 'hyperparameters': {'num_layers': 1,
  'optimizer': 'Adam',
  'learning_rate': 0.002,
  'biases': {'default': [0.0, 4.0], 'layer_specific': {}},
  'nodes_per_layer': {'default': 20, 'layer_specific': {}},
  'activation_funcs': {'default': 'linear', 'layer_specific': {'1': 'abs'}}}}

In [33]:
# Number of entries in the converted sweep
len(sweep_configuration)

11

In [42]:
# Names of the sweep parameters registered under 'isolated'
current_sweep_outputs.config.sweep_params['isolated'].keys()

dict_keys(['seed'])

In [47]:
# Print the variable name of every member of every coupled sweep group
for var_name in (
    member['var']
    for group in current_sweep_outputs.config.sweep_params['coupled'].values()
    for member in group
):
    print(var_name)

sigma
to_learn
covariance
title
axes
cells
name
proposal
loss_name
loss_function
loss_kwargs


In [49]:
# Display the hand-written sweep dictionary
sweep

{'sigma': 0.014139999635517597,
 'to_learn': ['alpha', 'beta'],
 'covariance': [[0.0149674, 0.00182529], [0.00182529, 0.0109968]],
 'title': '_doubly_constrained',
 'axes': [[0], [1]],
 'cells': 'cells.txt',
 'name': 'TotallyConstrained',
 'proposal': 'degree_higher',
 'loss_name': ['dest_attraction_ts_likelihood_loss'],
 'loss_function': ['custom'],
 'loss_kwargs': {'noise_percentage': None}}

In [40]:
# Total sweep parameters: isolated ones plus the members of all coupled groups
len(current_sweep_outputs.config.sweep_params['isolated']) + sum(map(len, current_sweep_outputs.config.sweep_params['coupled'].values()))

12

In [25]:
# Prepare the experiment config(s) for the converted sweep and keep the first.
# NOTE(review): the recorded output of this cell is
# "IndexError: list index out of range". The converted sweep has 11 entries,
# while the config counts 12 sweep parameters (11 coupled plus the isolated
# `seed`, which `sweep` does not provide) — possibly the cause; confirm.
temp_config = current_sweep_outputs.config.prepare_experiment_config(sweep_configuration)[0]

IndexError: list index out of range

# Temporary scripts

In [None]:
# Point the path variables at the experiment targeted by the cleanup below
experiment_id = 'JointTableSIM_NN_SweepedNoise_23_01_2024_21_33_25'
experiment_dir = '../data/outputs/cambridge_work_commuter_lsoas_to_msoas/exp1/' + experiment_id + '/'
# Same location, expressed relative to the current working directory
relative_experiment_dir = os.path.relpath(experiment_dir, start=os.getcwd())

In [None]:
# WARNING: destructive. For every directory under the experiment whose path
# contains 'doubly', delete its stored output files and then the directory.
#
# The walk is bottom-up so nested directories are emptied and removed before
# their parents. A top-down walk would call os.rmdir on a parent that still
# contains subdirectories and stop with OSError.
for dirpath, _dirnames, _filenames in os.walk(relative_experiment_dir, topdown=False):
    if 'doubly' not in dirpath:
        continue
    for fname in ['data.h5','metadata.json','outputs.log']:
        fpath = os.path.join(dirpath, fname)
        if os.path.exists(fpath):
            os.remove(fpath)
    # os.rmdir only works on empty directories; anything else is kept and
    # reported rather than aborting the whole cleanup.
    if os.path.isdir(dirpath):
        if os.listdir(dirpath):
            print(f'Kept non-empty directory: {dirpath}')
        else:
            os.rmdir(dirpath)