In [1]:
from datetime import datetime

import os
from pathlib import Path
import subprocess
import warnings
import re
import pickle
import pandas as pd
import functools

import jax
import jax.numpy as jnp

from src.config.core import Config
from src.config.sampler import Sampler
from src.config.data import DatasetType
import src.dataset as ds
from src.models.tabular import FCN
import src.training.utils as train_utils
import src.inference.utils as inf_utils
import src.visualization as viz
from src.config.data import Task
from src.inference.evaluation import evaluate_bde

from matplotlib import pyplot as plt
import numpy as np

2025-06-18 12:08:50,196 - datasets - INFO - PyTorch version 2.2.2+cpu available.
2025-06-18 12:08:50,198 - datasets - INFO - JAX version 0.4.28 available.
2025-06-18 12:08:50,640 - jax._src.xla_bridge - INFO - Unable to initialize backend 'cuda': 
2025-06-18 12:08:50,641 - jax._src.xla_bridge - INFO - Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
2025-06-18 12:08:50,642 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory


### Config template

In [2]:
# Template configuration for the bike-sharing regression experiments.
# `get_config` starts from this dict and overrides the experiment name, the
# sampler section, the seed and the saving directory.
CONFIG_DICT = dict(
    saving_dir='results/',
    experiment_name='bike',
    # Dataset location, task and split fractions.
    data=dict(
        path='data/bikesharing.data',
        source='local',
        data_type='tabular',
        task='regr',
        target_column=None,
        target_len=1,
        features=None,
        datapoint_limit=None,
        normalize=True,
        train_split=0.7,
        valid_split=0.1,
        test_split=0.2,
    ),
    # Network definition.
    model=dict(
        model='FCN',
        hidden_structure=[16, 16, 16, 2],
        activation='relu',
        use_bias=True,
    ),
    training=dict(
        # meaningless placeholder
        warmstart=dict(
            include=False,
            optimizer_config=dict(name="sgd", parameters={}),
        ),
        sampler=dict(
            name='sgld',
            warmup_steps=0,
            n_chains=4,
            n_samples=24000,  # total steps
            batch_size=512,
            step_size_init=2.0e-6,  # step_size_explore
            n_thinning=1,
            keep_warmup=False,
            optimizer_name='sgd',
            prior_config=dict(name='StandardNormal'),
            scheduler_config=dict(
                name='Cyclical',
                n_samples_per_cycle=200,
                parameters=dict(n_cycles=4),
            ),
        ),
    ),
    rng=1446,
    logging=False,
)

In [3]:
def get_config(
        exp_name: str = 'bike',
        n_chains: int = 4,
        n_cycles: int = 4,
        n_steps_per_cycle: int = 2000,
        n_samples_per_cycle: int = 200,
        n_thinning: int = 1,
        optimizer_name: str = 'adam',
        scheduler_name: str = 'Cyclical',
        step_size_init: float = 2.0e-6,
        step_size_sampling: float | None = None,
        seed: int = 0
    ):
    n_samples = n_cycles * n_steps_per_cycle

    new_config_dict = CONFIG_DICT.copy()
    new_config_dict['experiment_name'] = exp_name
    new_config_dict['training']['sampler'] = {
        'name': 'sgld',
        'warmup_steps': 0,
        'keep_warmup': False,
        'n_chains': n_chains,
        'n_samples': n_samples,  # total steps
        'batch_size': 512,
        'step_size_init': step_size_init,  # step_size_explore
        'n_thinning': n_thinning,
        'optimizer_name': optimizer_name,
        'prior_config': {
            'name': 'StandardNormal'
        },
        'scheduler_config': {
            'name': scheduler_name,
            'n_samples_per_cycle': n_samples_per_cycle,
            'parameters': {
                'n_cycles': n_cycles,
                'step_size_sampling': step_size_sampling
            }
        }
    }
    new_config_dict['rng'] = seed

    # datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    new_config_dict['saving_dir'] = f'results/'
    config_path = Path('experiments/csgld') / f'{exp_name}.yaml'
    # if config_path.exists():
    #     warnings.warn(f"Config file {config_path} already exists. Overwriting it.")
    if not config_path.parent.exists():
        config_path.parent.mkdir(parents=True)
    Config.from_dict(new_config_dict).to_yaml(config_path)

    return config_path

### Chains/Cycles

In [4]:
# Chain / cycle counts swept by the experiments below: 2, 4, ..., 12.
chains_cycles = list(range(2, 13, 2))
# Base seeds 0..4 used for the repeated runs.
seeds = list(range(5))

### Constant Schedule

In [5]:
# `get_config` preset for the constant-schedule experiments; callers still
# pass exp_name, n_chains, n_cycles and seed.
get_config_constant = functools.partial(
    get_config,
    # schedule / optimizer
    scheduler_name='Constant',
    optimizer_name='adam',
    step_size_init=0.01,
    step_size_sampling=1.0e-8,
    # run length
    n_steps_per_cycle=2500,
    n_samples_per_cycle=500,
    n_thinning=1,
)

In [6]:
import logging
import sys

# Suppress all log records emitted inside this kernel, whatever their level.
# This is process-local: the `train.py` subprocesses launched below still
# print their own log lines.
logging.disable(level=sys.maxsize)

In [7]:
# parallel
# Constant-schedule runs: n chains, one cycle each, one config per seed.
config_paths_p = []
for i, n in enumerate(chains_cycles):
    # Only the 12-chain setting is generated and run by this cell.
    if n != 12:
        continue
    for seed in seeds:
        exp_name = f'bike2/parallel_constant_{n}_seed{seed}'
        # NOTE(review): the stored seed is `seed + i` while the experiment name
        # records `seed` -- confirm the offset is intended.
        config_path = get_config_constant(
            exp_name=exp_name,
            n_chains=n,
            n_cycles=1,
            seed=seed+i
        )
        config_paths_p.append(config_path)

for config_path in config_paths_p:
    print("=" * 50)
    print(f'Running training for config: {config_path}')
    # sys.executable keeps the child on the same interpreter/environment as
    # the kernel instead of whatever `python` resolves to on PATH.
    completed = subprocess.run([sys.executable, 'train.py', '-c', str(config_path), '-d', '12'])
    if completed.returncode != 0:
        # Surface failed runs instead of silently moving on to the next config.
        warnings.warn(f'train.py exited with code {completed.returncode} for {config_path}')

Running training for config: experiments/csgld/bike2/parallel_constant_12_seed0.yaml
2025-06-18 12:08:55,847 - __main__ - INFO - Loaded 1 Experiment(s)
2025-06-18 12:08:56,392 - datasets - INFO - PyTorch version 2.2.2+cpu available.
2025-06-18 12:08:56,392 - datasets - INFO - JAX version 0.4.28 available.
2025-06-18 12:08:56,880 - __main__ - INFO - > Running experiment: bike2/parallel_constant_12_seed0
2025-06-18 12:08:56,890 - jax._src.xla_bridge - INFO - Unable to initialize backend 'cuda': 
2025-06-18 12:08:56,890 - jax._src.xla_bridge - INFO - Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
2025-06-18 12:08:56,890 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
2025-06-18 12:08:56,935 - src.training.trainer - INFO - > Setting up directories...


Sampling: 100%|██████████| 2500/2500 [00:17<00:00, 140.59it/s]


Running training for config: experiments/csgld/bike2/parallel_constant_12_seed1.yaml
2025-06-18 12:10:40,627 - __main__ - INFO - Loaded 1 Experiment(s)
2025-06-18 12:10:41,206 - datasets - INFO - PyTorch version 2.2.2+cpu available.
2025-06-18 12:10:41,207 - datasets - INFO - JAX version 0.4.28 available.
2025-06-18 12:10:41,591 - __main__ - INFO - > Running experiment: bike2/parallel_constant_12_seed1
2025-06-18 12:10:41,600 - jax._src.xla_bridge - INFO - Unable to initialize backend 'cuda': 
2025-06-18 12:10:41,600 - jax._src.xla_bridge - INFO - Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
2025-06-18 12:10:41,601 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
2025-06-18 12:10:41,647 - src.training.trainer - INFO - > Setting up directories...


Sampling: 100%|██████████| 2500/2500 [00:17<00:00, 142.91it/s]


Running training for config: experiments/csgld/bike2/parallel_constant_12_seed2.yaml
2025-06-18 12:12:24,636 - __main__ - INFO - Loaded 1 Experiment(s)
2025-06-18 12:12:25,208 - datasets - INFO - PyTorch version 2.2.2+cpu available.
2025-06-18 12:12:25,209 - datasets - INFO - JAX version 0.4.28 available.
2025-06-18 12:12:25,592 - __main__ - INFO - > Running experiment: bike2/parallel_constant_12_seed2
2025-06-18 12:12:25,602 - jax._src.xla_bridge - INFO - Unable to initialize backend 'cuda': 
2025-06-18 12:12:25,602 - jax._src.xla_bridge - INFO - Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
2025-06-18 12:12:25,602 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
2025-06-18 12:12:25,643 - src.training.trainer - INFO - > Setting up directories...


Sampling: 100%|██████████| 2500/2500 [00:16<00:00, 148.26it/s]


Running training for config: experiments/csgld/bike2/parallel_constant_12_seed3.yaml
2025-06-18 12:14:06,983 - __main__ - INFO - Loaded 1 Experiment(s)
2025-06-18 12:14:07,517 - datasets - INFO - PyTorch version 2.2.2+cpu available.
2025-06-18 12:14:07,518 - datasets - INFO - JAX version 0.4.28 available.
2025-06-18 12:14:07,891 - __main__ - INFO - > Running experiment: bike2/parallel_constant_12_seed3
2025-06-18 12:14:07,900 - jax._src.xla_bridge - INFO - Unable to initialize backend 'cuda': 
2025-06-18 12:14:07,900 - jax._src.xla_bridge - INFO - Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
2025-06-18 12:14:07,901 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
2025-06-18 12:14:07,945 - src.training.trainer - INFO - > Setting up directories...


Sampling: 100%|██████████| 2500/2500 [00:17<00:00, 143.66it/s]


Running training for config: experiments/csgld/bike2/parallel_constant_12_seed4.yaml
2025-06-18 12:15:50,713 - __main__ - INFO - Loaded 1 Experiment(s)
2025-06-18 12:15:51,250 - datasets - INFO - PyTorch version 2.2.2+cpu available.
2025-06-18 12:15:51,251 - datasets - INFO - JAX version 0.4.28 available.
2025-06-18 12:15:51,627 - __main__ - INFO - > Running experiment: bike2/parallel_constant_12_seed4
2025-06-18 12:15:51,636 - jax._src.xla_bridge - INFO - Unable to initialize backend 'cuda': 
2025-06-18 12:15:51,636 - jax._src.xla_bridge - INFO - Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
2025-06-18 12:15:51,637 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
2025-06-18 12:15:51,680 - src.training.trainer - INFO - > Setting up directories...


Sampling: 100%|██████████| 2500/2500 [00:43<00:00, 57.84it/s] 


In [None]:
# sequential
# Constant-schedule runs with the largest cycle count, one config per seed.
config_paths_c = []
max_cycles = max(chains_cycles)  # builtin max is enough for a list of ints
for seed in seeds:
    exp_name = f'bike/sequential_constant_{max_cycles}_seed{seed}'
    config_path = get_config_constant(
        exp_name=exp_name,
        n_chains=10,
        n_cycles=max_cycles,
        seed=seed
    )
    config_paths_c.append(config_path)

for config_path in config_paths_c:
    print("=" * 50)
    print(f'Running training for config: {config_path}')
    # sys.executable keeps the child on the same interpreter/environment as
    # the kernel instead of whatever `python` resolves to on PATH.
    completed = subprocess.run([sys.executable, 'train.py', '-c', str(config_path), '-d', '12'])
    if completed.returncode != 0:
        # Surface failed runs instead of silently moving on to the next config.
        warnings.warn(f'train.py exited with code {completed.returncode} for {config_path}')

In [7]:
# One-off run: a single chain with 4 constant-schedule cycles, seed 3.
exp_name = 'bike/sequential_constant_4_seed3'
config_path = get_config_constant(
    exp_name=exp_name,
    n_chains=1,
    n_cycles=4,
    seed=3
)
# sys.executable keeps the child on the same interpreter/environment as the
# kernel instead of whatever `python` resolves to on PATH.
completed = subprocess.run([sys.executable, 'train.py', '-c', str(config_path), '-d', '12'])
if completed.returncode != 0:
    # Make a failed run visible; the return code alone is easy to overlook.
    warnings.warn(f'train.py exited with code {completed.returncode} for {config_path}')
completed  # displayed as the cell result

2025-06-18 10:55:25,631 - __main__ - INFO - Loaded 1 Experiment(s)
2025-06-18 10:55:26,462 - datasets - INFO - PyTorch version 2.2.2+cpu available.
2025-06-18 10:55:26,463 - datasets - INFO - JAX version 0.4.28 available.
2025-06-18 10:55:27,064 - __main__ - INFO - > Running experiment: bike/sequential_constant_4_seed3
2025-06-18 10:55:27,074 - jax._src.xla_bridge - INFO - Unable to initialize backend 'cuda': 
2025-06-18 10:55:27,074 - jax._src.xla_bridge - INFO - Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
2025-06-18 10:55:27,075 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
2025-06-18 10:55:27,128 - src.training.trainer - INFO - > Setting up directories...


Sampling: 100%|██████████| 10000/10000 [00:28<00:00, 347.86it/s]


CompletedProcess(args=['python', 'train.py', '-c', 'experiments/csgld/bike/sequential_constant_4_seed3.yaml', '-d', '12'], returncode=0)

### Cyclical Schedule

In [None]:
# `get_config` preset for the cyclical-schedule experiments; callers still
# pass exp_name, n_chains, n_cycles and seed.
get_config_cyclical = functools.partial(
    get_config,
    # schedule / optimizer
    scheduler_name='Cyclical',
    optimizer_name='sgd',
    step_size_init=2.0e-6,
    # run length
    n_steps_per_cycle=6000,
    n_samples_per_cycle=200,
    n_thinning=1,
)

In [None]:
# parallel
# Cyclical-schedule runs: n chains, one cycle each, for every chain count and seed.
config_paths_p = []
for n in chains_cycles:
    for seed in seeds:
        exp_name = f'bike/parallel_cyclical_{n}_seed{seed}'
        config_path = get_config_cyclical(
            exp_name=exp_name,
            n_chains=n,
            n_cycles=1,
            seed=seed
        )
        config_paths_p.append(config_path)

for config_path in config_paths_p:
    print("=" * 50)
    print(f'Running training for config: {config_path}')
    # sys.executable keeps the child on the same interpreter/environment as
    # the kernel instead of whatever `python` resolves to on PATH.
    completed = subprocess.run([sys.executable, 'train.py', '-c', str(config_path), '-d', '12'])
    if completed.returncode != 0:
        # Surface failed runs instead of silently moving on to the next config.
        warnings.warn(f'train.py exited with code {completed.returncode} for {config_path}')

In [None]:
# sequential
# A single combined cyclical run: one chain per seed entry, the largest cycle
# count, and a fixed seed of 0.
config_paths_s = []
max_cycles = max(chains_cycles)  # builtin max is enough for a list of ints
exp_name = f'bike/sequential_cyclical_{max_cycles}_combined'
config_path = get_config_cyclical(
    exp_name=exp_name,
    n_chains=len(seeds),
    n_cycles=max_cycles,
    seed=0
)
config_paths_s.append(config_path)

for config_path in config_paths_s:
    print("=" * 50)
    print(f'Running training for config: {config_path}')
    # sys.executable keeps the child on the same interpreter/environment as
    # the kernel instead of whatever `python` resolves to on PATH.
    completed = subprocess.run([sys.executable, 'train.py', '-c', str(config_path), '-d', '12'])
    if completed.returncode != 0:
        # Surface failed runs instead of silently moving on to the next config.
        warnings.warn(f'train.py exited with code {completed.returncode} for {config_path}')