Improve AbstractTrainer Docs #2024

Merged (2 commits, Apr 6, 2024)
109 changes: 96 additions & 13 deletions core/src/autogluon/core/trainer/abstract_trainer.py
@@ -80,8 +80,68 @@

# TODO: Dynamic model loading for ensemble models during prediction, only load more models if prediction is uncertain. This dynamically reduces inference time.
# TODO: Try midstack Semi-Supervised. Just take final models and re-train them, use bagged preds for SS rows. This would be very cheap and easy to try.
# TODO: Move to autogluon.core
class AbstractTrainer:
"""
AbstractTrainer contains logic to train a variety of models under a variety of constraints and automatically generate a multi-layer stack ensemble.
Beyond the basic functionality, it also has support for model refitting, distillation, pseudo-labelling, unlabeled data, and much more.
It is not recommended to use Trainer directly. Instead, use Predictor or Learner, which use Trainer internally.
This documentation is for developers. Users should avoid this class.
Due to the complexity of the logic within this class, a text description will not give the full picture.
It is recommended to carefully read the code and use a debugger to understand how it works.
AbstractTrainer makes far fewer assumptions about the problem than Learner and Predictor.
It expects these ambiguities to have already been resolved upstream. For example, problem_type, feature_metadata, num_classes, etc.
Parameters
----------
path : str
Path to save and load trainer artifacts to disk.
Path should end in `/` or `os.path.sep`.
problem_type : str
One of ['binary', 'multiclass', 'regression', 'quantile', 'softclass']
num_classes : int
The number of classes in the problem.
If problem_type is in ['regression', 'quantile'], this must be None.
If problem_type is 'binary', this must be 2.
If problem_type is in ['multiclass', 'softclass'], this must be >= 2.
feature_metadata : FeatureMetadata
FeatureMetadata for X. Sent to each model during fit.
eval_metric : Scorer, default = None
Metric to optimize. If None, a default metric is used depending on the problem_type.
quantile_levels : List[float] | np.ndarray, default = None
# TODO: Add documentation, not documented in Predictor.
Only used when problem_type=quantile
low_memory : bool, default = True
Deprecated parameter, likely to be removed in future versions.
If True, caches models to disk separately instead of containing all models within memory.
If False, may cause a variety of bugs.
k_fold : int, default = 0
If <2, then non-bagged mode is used.
If >= 2, then bagged mode is used with num_bag_folds == k_fold for each model.
Bagged mode changes the way models are trained and ensembled.
Bagged mode enables multi-layer stacking and repeated bagging.
n_repeats : int, default = 1
The maximum repeats of bagging to do when in bagged mode.
Larger values take linearly longer to train and infer, but improve quality slightly.
sample_weight : str, default = None
Column name of the sample weight in X
weight_evaluation : bool, default = False
If True, the eval_metric is calculated with sample_weight incorporated into the score.
save_data : bool, default = True
Whether to cache the data (X, y, X_val, y_val) to disk.
Required for a variety of advanced post-fit functionality.
It is recommended to keep as True.
random_state : int, default = 0
Random state for data splitting in bagged mode.
verbosity : int, default = 2
Verbosity levels range from 0 to 4 and control how much information is printed.
Higher levels correspond to more detailed print statements (you can set verbosity = 0 to suppress warnings).
If using logging, you can alternatively control amount of information printed via `logger.setLevel(L)`,
where `L` ranges from 0 to 50 (Note: higher values of `L` correspond to fewer print statements, opposite of verbosity levels).
"""

trainer_file_name = "trainer.pkl"
trainer_info_name = "info.pkl"
trainer_info_json_name = "info.json"
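For orientation, here is a minimal, hedged sketch of constructing a trainer with the parameters documented above. `ConcreteTrainer` is a hypothetical subclass used purely for illustration; in practice Learner/Predictor builds and drives the trainer, and users should not instantiate one themselves.

```python
from autogluon.core.trainer.abstract_trainer import AbstractTrainer

# Hypothetical subclass for illustration only; real trainers live in the
# tabular package and are created internally by Learner/Predictor.
class ConcreteTrainer(AbstractTrainer):
    pass

trainer = ConcreteTrainer(
    path="ag_models/",          # must end in '/' or os.path.sep
    problem_type="multiclass",
    num_classes=3,              # must be >= 2 for multiclass/softclass
    k_fold=8,                   # >= 2 enables bagged mode
    n_repeats=2,                # repeated bagging; linearly more training time
    verbosity=2,                # 0-4; higher prints more detail
)
```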
Expand All @@ -90,20 +150,23 @@ class AbstractTrainer:
def __init__(
self,
path: str,
*,
problem_type: str,
eval_metric=None,
num_classes=None,
quantile_levels=None,
low_memory=False,
feature_metadata=None,
k_fold=0,
n_repeats=1,
sample_weight=None,
weight_evaluation=False,
save_data=False,
random_state=0,
verbosity=2,
num_classes: int = None,
feature_metadata: FeatureMetadata = None,
eval_metric: Scorer = None,
quantile_levels: List[float] | np.ndarray = None,
low_memory: bool = True,
Inline review comment (Contributor Author):
Note: low_memory previously defaulted to False; however, in every case where AbstractTrainer was initialized, it was set to True. False is not actively supported, so I've updated the default to True here; this should have no impact.

k_fold: int = 0,
n_repeats: int = 1,
sample_weight: str = None,
weight_evaluation: bool = False,
save_data: bool = False,
random_state: int = 0,
verbosity: int = 2,
):
self._validate_num_classes(num_classes=num_classes, problem_type=problem_type)
self._validate_quantile_levels(quantile_levels=quantile_levels, problem_type=problem_type)
self.path = path
self.problem_type = problem_type
self.feature_metadata = feature_metadata
@@ -3836,3 +3899,23 @@ def calibrate_decision_threshold(
verbose=verbose,
**kwargs,
)

@staticmethod
def _validate_num_classes(num_classes: int, problem_type: str):
if problem_type == BINARY:
assert num_classes is not None and num_classes == 2, f"num_classes must be 2 when problem_type='{problem_type}' (num_classes={num_classes})"
elif problem_type in [MULTICLASS, SOFTCLASS]:
assert num_classes is not None and num_classes >= 2, f"num_classes must be >=2 when problem_type='{problem_type}' (num_classes={num_classes})"
elif problem_type in [REGRESSION, QUANTILE]:
assert num_classes is None, f"num_clases must be None when problem_type='{problem_type}' (num_classes={num_classes})"
else:
raise AssertionError(f"Unknown problem_type: '{problem_type}'. Valid problem types: {[BINARY, MULTICLASS, REGRESSION, SOFTCLASS, QUANTILE]}")

@staticmethod
def _validate_quantile_levels(quantile_levels: List[float] | np.ndarray, problem_type: str):
if problem_type == QUANTILE:
assert quantile_levels is not None, f"quantile_levels must not be None when problem_type='{problem_type}' (quantile_levels={quantile_levels})"
assert isinstance(quantile_levels, (list, np.ndarray)), f"quantile_levels must be a list or np.ndarray (quantile_levels={quantile_levels})"
assert len(quantile_levels) > 0, f"quantile_levels must not be an empty list (quantile_levels={quantile_levels})"
else:
assert quantile_levels is None, f"quantile_levels must be None when problem_type='{problem_type}' (quantile_levels={quantile_levels})"
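Since the new validators are static methods, their behavior is easy to check in isolation. A small hedged sketch, assuming this branch is installed and that the problem-type constants resolve to the usual AutoGluon strings ('binary', 'regression', 'quantile', ...):

```python
from autogluon.core.trainer.abstract_trainer import AbstractTrainer

# Valid combinations pass silently:
AbstractTrainer._validate_num_classes(num_classes=2, problem_type="binary")
AbstractTrainer._validate_num_classes(num_classes=None, problem_type="regression")
AbstractTrainer._validate_quantile_levels(quantile_levels=[0.1, 0.5, 0.9], problem_type="quantile")

# Invalid combinations raise AssertionError with the messages defined above:
try:
    AbstractTrainer._validate_num_classes(num_classes=3, problem_type="binary")
except AssertionError as err:
    print(err)  # num_classes must be 2 when problem_type='binary' (num_classes=3)
```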
2 changes: 0 additions & 2 deletions tabular/src/autogluon/tabular/predictor/predictor.py
@@ -70,8 +70,6 @@
# Extra TODOs (Stretch): Can occur post v1.0
# TODO: make core_kwargs a kwargs argument to predictor.fit
# TODO: add aux_kwargs to predictor.fit
# TODO: add pip freeze + python version output after fit + log file, validate that same pip freeze on load as cached
# TODO: Add logging comments that models are serialized on disk after fit
# TODO: consider adding kwarg option for data which has already been preprocessed by feature generator to skip feature generation.
# TODO: Resolve raw text feature usage in default feature generator
# TODO: num_bag_sets -> ag_args