## Config

This notebook gives several examples of how to use the `model_utils.BaseConfig` class.

In [6]:
import os
from typing import Literal

from model_utils import BaseConfig, UNIMPLEMENTED

class ICRDatasetConfig(BaseConfig):
    """Configs for the ICR dataset and its data loaders.

    Define required configs here; the type of each config should be explicitly
    annotated.

    A config can be assigned a default/recommended value, e.g.
        >>> train_split_ratio: float = .85

    Otherwise, the config should be set to the special value `UNIMPLEMENTED`.
    With that special value, an error is raised at runtime if the user does not
    set the required config, e.g.
        >>> train_batch_size: int = UNIMPLEMENTED
    """

    train_split_ratio: float = .85

    class_undersample_rate: float = 1.5
    """class_undersample_rate == #samples_in_class_0 / #samples_in_class_1"""


    # ------    config for torch.DataLoader, see `help(DataLoader)` --------
    train_batch_size: int = UNIMPLEMENTED
    eval_batch_size: int = UNIMPLEMENTED
    # `os.cpu_count()` returns None when the CPU count cannot be determined;
    # fall back to 1 so the default is always an int and the comparisons
    # performed on `num_workers` never see None.
    num_workers: int = os.cpu_count() or 1
    pin_memory: bool = True

    @property
    def persistent_workers(self):
        """Dynamic config value derived from `num_workers` and the platform.

        Returns:
            True only when `num_workers > 0` and the interpreter is running on
            Windows (`os.name == 'nt'`); False otherwise.

        NOTE(review): the previous wording claimed the value is True when
        *not* running on Windows, which contradicts the condition below.
        The code is left unchanged -- confirm the intended platform.
        """
        return self.num_workers > 0 and os.name == 'nt' # 'nt' == Microsoft.Windows

In [7]:
# Both required batch sizes are supplied up front through the constructor.
config = ICRDatasetConfig(train_batch_size=128, eval_batch_size=256)

# Check that the configs are valid and legal, then freeze them.
config.check_and_freeze()
config.display()


Configuration:
_is_protocol                   False
class_undersample_rate         1.5
eval_batch_size                256
num_workers                    12
persistent_workers             True
pin_memory                     True
train_batch_size               128
train_split_ratio              0.85



In [8]:
config = ICRDatasetConfig(
    train_batch_size=128,
)
# `eval_batch_size` is still UNIMPLEMENTED, so the check is expected to fail.
# The error is caught and printed so the notebook survives "Restart & Run All".
try:
    config.check_and_freeze()
except NotImplementedError as err:
    print(f'{type(err).__name__}: {err}')
else:
    config.display()

NotImplementedError: attribute: 'eval_batch_size' should be implemented

- A config can be set after the constructor, but not after `check_and_freeze()`.

In [9]:
config = ICRDatasetConfig(train_batch_size=128)

# Assigning a config after construction is fine while it is not yet frozen.
config.eval_batch_size = 1024

config.check_and_freeze()
config.display()

Configuration:
_is_protocol                   False
class_undersample_rate         1.5
eval_batch_size                1024
num_workers                    12
persistent_workers             True
pin_memory                     True
train_batch_size               128
train_split_ratio              0.85



In [10]:
config = ICRDatasetConfig(
    train_batch_size=128,
)
config.eval_batch_size = 1024 # OK!
config.check_and_freeze() # make the config no longer mutable

# Assigning after the freeze is expected to fail. The error is caught and
# printed so the notebook survives "Restart & Run All".
try:
    config.eval_batch_size = 2048 # cause error
except AttributeError as err:
    print(f'{type(err).__name__}: {err}')

# Show the config after the rejected assignment.
config.display()

AttributeError: attribute 'eval_batch_size' is immutable.

## Example Dataset Class That Works with the Config

In [11]:

from torch.utils.data import Dataset, DataLoader

class ICRDataset(Dataset):
    """Skeleton ICR dataset driven by an `ICRDatasetConfig`.

    Planned responsibilities (`load` and `__getitem__` are still stubs):
        1. Load data from csv files
            - split them into train and test sets
            - 85:15
        2. Preprocessing
            - missing values
            - categorical encoding
        3. Dataloader
            - **undersampling**
    """

    def __init__(
        self,
        split: Literal['train', 'test', 'infer'],
        config: ICRDatasetConfig,
    ):
        """Store the split name and the config used to build the loader.

        Args:
            split: which split this dataset serves.
            config: config object providing the batch sizes and loader options.
        """
        self.config = config
        self.split = split

    @property
    def dataloader(self) -> DataLoader:
        """Build a `DataLoader` over this dataset from the stored config.

        The 'train' split uses `train_batch_size` and enables shuffling and
        `drop_last`; every other split uses `eval_batch_size` with both
        disabled.

        Returns:
            DataLoader: a new loader wrapping `self`.
        """
        is_train = self.split == 'train'
        if is_train:
            batch_size = self.config.train_batch_size
        else:
            batch_size = self.config.eval_batch_size

        # Never request more workers than the batch size.
        worker_count = min(self.config.num_workers, batch_size)

        return DataLoader(
            self,
            batch_size=batch_size,
            shuffle=is_train,
            num_workers=worker_count,
            persistent_workers=self.config.persistent_workers,
            drop_last=is_train,
            pin_memory=self.config.pin_memory,
        )

    def load(self):
        """Placeholder for data loading; not implemented yet."""
        pass

    def __getitem__(self, index):
        """Placeholder for item access; not implemented yet.

        Args:
            index: position of the requested sample.
        """
        pass
    

## Advanced

### Explicit Checking Hooks for Pitfalls

For example, setting the `num_workers` parameter of `DataLoader` to a value greater than the CPU count is pointless.
We can register a checking hook to prevent this.

In [12]:
@ICRDatasetConfig.register_checking_hook
def check_num_workers(config: ICRDatasetConfig):
    """Checking hook: reject a `num_workers` larger than the CPU count.

    Args:
        config: the config instance being checked.

    Raises:
        AssertionError: if `config.num_workers` exceeds `os.cpu_count()`.
    """
    cpu_total = os.cpu_count()
    assert config.num_workers <= cpu_total, (
        f'expect num_workers <= cpu_count==({cpu_total}), '
        f'but got num_workers={config.num_workers}'
    )

In [13]:
config = ICRDatasetConfig()
config.train_batch_size = 1
config.eval_batch_size = 1
config.num_workers = 100
# The registered hook is expected to reject this value on machines with fewer
# than 100 CPUs. The error is caught and printed so the notebook survives
# "Restart & Run All".
try:
    config.check_and_freeze()
except AssertionError as err:
    print(f'{type(err).__name__}: {err}')
else:
    config.display()

AssertionError: expect num_workers <= cpu_count==(12), but got num_workers=100