In [1]:
import os
import glob
import optuna
import warnings
import h5py as h5
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from copy import deepcopy

from gensit.config import Config
from gensit.inputs import Inputs
from gensit.outputs import Outputs
from gensit.utils.misc_utils import *
from gensit.utils.math_utils import *
from gensit.utils.probability_utils import *
from gensit.contingency_table import instantiate_ct
from gensit.contingency_table.MarkovBasis import instantiate_markov_basis

In [2]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Load the autoreload extension and enable mode 2 so that edits to external
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

## Import samples

In [105]:
# Experiment location on disk: <outputs root>/<dataset>/<group>/<experiment>/
dataset = 'DC'
# Experiment group id (the trailing slash is part of the value)
experiment_group_id = 'exp1/'
# Id of the experiment run to analyse
experiment_id = "JointTableSIM_NN_SweepedNoise__totally_and_cell_constrained_21_05_2024_13_25_40"
experiment_dir = '../../data/outputs/{}/{}/{}/'.format(
    dataset,
    experiment_group_id,
    experiment_id
)
# Same location, expressed relative to the current working directory
relative_experiment_dir = os.path.relpath(experiment_dir, start = os.getcwd())

In [106]:
# Notebook-level logger: console level 'INFO', file level 'EMPTY'
logger = setup_logger(__name__, console_level = 'INFO', file_level = 'EMPTY')

In [107]:
# Output processing settings for the JointGeNSIT experiment
settings = dict(
    logging_mode = "INFO",
    # Coordinate expressions used to slice the sweep (one loss combination, one sigma)
    coordinate_slice = [
        "da.loss_name==str(['dest_attraction_ts_likelihood_loss', 'table_likelihood_loss'])",
        "da.sigma==0.14142",
    ],
    slice = True,
    metadata_keys = [],
    # Burn-in / thinning / trimming applied along the 'iter' dimension
    burnin_thinning_trimming = [
        {'iter': dict(burnin = 10000, thinning = 1, trimming = 100000)}
    ],
    # Only the 'table' samples are requested
    sample = ["table"],
    group_by = [],
    filename_ending = "test",
    force_reload = True,
    n_workers = 1,
)

In [None]:
# Build the outputs handler from the experiment's config.json and the
# processing settings defined above
jointgensit_outputs = Outputs(
    config = os.path.join(relative_experiment_dir, "config.json"),
    settings = settings,
    inputs = None,
    slice = True,
    level = 'NOTE',
)
# Set the console level of the handler's logger to 'NOTE'
jointgensit_outputs.logger.setLevels(console_level = 'NOTE')
# Load all data
jointgensit_outputs.load()

# Keep only element 0; from here on the name refers to that element rather
# than to the full collection returned by the constructor
# NOTE(review): presumably the first sweep of the SIM_NN experiment -- confirm
jointgensit_outputs = jointgensit_outputs.get(0)

12:13.569 config INFO ----------------------------------------------------------------------------------
12:13.586 config INFO Parameter space size: 
 --- sigma: ['sigma', 'to_learn'] (3)
 --- loss_name: ['loss_name', 'loss_function', 'loss_kwargs'] (2)
12:13.603 config INFO Total = 6.
12:13.620 config INFO ----------------------------------------------------------------------------------
12:13.691 outputs INFO //////////////////////////////////////////////////////////////////////////////////
12:13.708 outputs INFO Slicing coordinates:
12:13.725 outputs INFO loss_name==str(['dest_attraction_ts_likelihood_loss', 'table_likelihood_loss'])
12:13.741 outputs INFO sigma==0.14142
12:13.758 outputs INFO iter: burnin = 10000, thinning = 1, trimming = 100000
12:13.775 outputs INFO //////////////////////////////////////////////////////////////////////////////////
12:14.457 outputs INFO Collecting samples table.
Collecting h5 data sequentially:   0%|          | 0/6 [00:00<?, ?it/s]12:14.616 input

In [None]:
# Display the coordinates of the selected outputs element
jointgensit_outputs.coords

In [19]:
# Build the experiment inputs from the configuration of the loaded outputs,
# reusing the outputs' logger
inputs = Inputs(
    config = jointgensit_outputs.config,
    synthetic_data = False,
    logger = jointgensit_outputs.logger,
)
# Cast the input data to xarray
inputs.cast_to_xarray()

56:37.241 inputs NOTE Loading Harris Wilson data ...
56:37.344 inputs NOTE Margins not provided
56:37.360 inputs NOTE Cells subset values file not provided


In [88]:
# Ground-truth column sums: keep the cells selected by the test-cells mask
# (dropping the rest), then sum over the 'origin' dimension
ground_truth_colsums = (
    inputs.data.ground_truth_table
    .where(inputs.data.test_cells_mask, drop = True)
    .sum('origin', skipna = True)
)

In [96]:
# Column sums of the mean table: average the table samples over 'iter',
# keep the cells selected by the test-cells mask, then sum over 'origin'
jointgensit_table_colsums_mean = (
    jointgensit_outputs.data.table
    .mean('iter', dtype = 'float64', skipna = True)
    .where(inputs.data.test_cells_mask, drop = True)
    .sum('origin', skipna = True)
)

In [99]:
# Dimension sizes of the model column sums (carries a size-1 'sweep' dimension)
jointgensit_table_colsums_mean.sizes

Frozen({'destination': 43, 'sweep': 1})

In [101]:
# Dimension sizes of the ground-truth column sums (no 'sweep' dimension)
ground_truth_colsums.sizes

Frozen({'destination': 43})

In [102]:
# Relative column-sum error of the mean table against the ground truth.
#
# Validate the shared indexes FIRST. `xr.broadcast` aligns its arguments with
# an outer join internally, so an exact alignment performed after it can never
# fail: mismatched destinations would already have been padded with NaN.
# With join='exact' up front, a mismatch raises instead of going unnoticed.
jointgensit_table_colsums_mean,ground_truth_colsums = xr.align(
    jointgensit_table_colsums_mean,
    ground_truth_colsums,
    join = 'exact'
)
# Expand both arrays to a common set of dimensions; the ground truth gains the
# size-1 'sweep' dimension carried by the model column sums.
jointgensit_table_colsums_mean,ground_truth_colsums = xr.broadcast(
    jointgensit_table_colsums_mean,
    ground_truth_colsums
)
# Signed relative error per destination; absolute values are taken downstream.
# NOTE(review): a destination whose ground-truth column sum is zero yields inf/NaN here.
jointgensit_relative_colsum_l1_error = (
    jointgensit_table_colsums_mean - ground_truth_colsums
) / ground_truth_colsums

In [103]:
# Sum of the absolute relative column-sum errors over all elements
abs(jointgensit_relative_colsum_l1_error).sum()

In [104]:
# Display the signed relative errors with the 'sweep' dimension removed
jointgensit_relative_colsum_l1_error.squeeze('sweep')

In [84]:
# SRMSE between the mean table (averaged over 'iter') and the ground-truth
# table, passing the test-cells mask to `srmse`
srmse(
    prediction = jointgensit_outputs.data.table.mean('iter',dtype='float64'),
    ground_truth = inputs.data.ground_truth_table,
    mask = inputs.data.test_cells_mask
)

In [66]:
# Output processing settings for the GMEL comparison experiment:
# no coordinate slicing, no burn-in/thinning/trimming, 'intensity' samples only
gmel_settings = dict(
    logging_mode = "INFO",
    coordinate_slice = [],
    slice = False,
    metadata_keys = [],
    burnin_thinning_trimming = [],
    sample = ["intensity"],
    group_by = [],
    filename_ending = "test",
    force_reload = True,
    n_workers = 1,
)

# Build the outputs handler for the GMEL comparison run
# NOTE(review): `slice = True` is passed here while gmel_settings["slice"] is
# False -- confirm which of the two takes precedence inside Outputs
gmel_outputs = Outputs(
    config = f'../../data/outputs/{dataset}/comparisons/GraphAttentionNetwork_Comparison_UnsetNoise__doubly_and_cell_constrained_all_region_features_16_05_2024_21_06_14/config.json',
    settings = gmel_settings,
    inputs = None,
    slice = True,
    level = 'NOTE',
)
# Set the console level of the handler's logger to 'NOTE'
gmel_outputs.logger.setLevels(console_level = 'NOTE')
# Load all data
gmel_outputs.load()

# Keep only element 0; from here on the name refers to that element rather
# than to the full collection returned by the constructor
# NOTE(review): presumably the first sweep of the experiment -- confirm
gmel_outputs = gmel_outputs.get(0)

01:10.487 config INFO ----------------------------------------------------------------------------------
01:10.503 config INFO Parameter space size: 
 --- seed (10)
01:10.519 config INFO Total = 10.
01:10.535 config INFO ----------------------------------------------------------------------------------
01:10.719 outputs INFO Collecting samples intensity.
Collecting h5 data sequentially:   0%|          | 0/10 [00:00<?, ?it/s]01:10.779 inputs NOTE Loading Harris Wilson data ...
01:10.883 inputs NOTE Margins not provided
01:10.899 inputs NOTE Cells subset values file not provided
01:10.967 outputs NOTE Loading h5 data into xarrays...
01:13.802 outputs NOTE Populating data dictionary
01:14.233 outputs PROGRESS intensity: (1, 10000, 179, 179)
Collecting h5 data sequentially:  10%|█         | 1/10 [00:03<00:32,  3.58s/it]01:14.326 inputs NOTE Loading Harris Wilson data ...
01:14.387 inputs NOTE Margins not provided
01:14.396 inputs NOTE Cells subset values file not provided
01:14.435 outputs

In [67]:
# GMEL column sums: keep the cells selected by the test-cells mask, sum over
# 'origin' within each seed group, then average over 'iter'
gmel_intensity_colsums_mean = (
    gmel_outputs.data.intensity
    .where(inputs.data.test_cells_mask, drop = True)
    .groupby('seed')
    .sum('origin')
    .mean('iter', dtype = 'float64')
)

In [68]:
# Signed relative column-sum error of GMEL against the ground truth,
# averaged over seeds
gmel_relative_colsum_l1_error = (
    (gmel_intensity_colsums_mean - ground_truth_colsums) / ground_truth_colsums
).mean('seed', dtype = 'float64')

In [69]:
# Sum of the absolute seed-averaged relative column-sum errors
abs(gmel_relative_colsum_l1_error).sum()

In [77]:
# SRMSE per seed: average the intensity over 'iter' within each seed group,
# apply `srmse` to every seed group, then average the scores over seeds
(
    gmel_outputs.data.intensity
    .groupby('seed')
    .mean('iter', dtype = 'float64')
    .groupby('seed')
    .map(
        srmse,
        ground_truth = inputs.data.ground_truth_table,
        mask = inputs.data.test_cells_mask,
    )
    .mean('seed')
)