Improve AbstractTrainer Docs #2024

Merged (2 commits, Apr 6, 2024)
109 changes: 96 additions & 13 deletions core/src/autogluon/core/trainer/abstract_trainer.py
@@ -80,8 +80,68 @@

# TODO: Dynamic model loading for ensemble models during prediction, only load more models if prediction is uncertain. This dynamically reduces inference time.
# TODO: Try midstack Semi-Supervised. Just take final models and re-train them, use bagged preds for SS rows. This would be very cheap and easy to try.
# TODO: Move to autogluon.core
class AbstractTrainer:
"""
AbstractTrainer contains logic to train a variety of models under a variety of constraints and automatically generate a multi-layer stack ensemble.
Beyond the basic functionality, it also has support for model refitting, distillation, pseudo-labelling, unlabeled data, and much more.
It is not recommended to use Trainer directly. Instead, use Predictor or Learner, which use Trainer internally.
This documentation is for developers. Users should avoid this class.
Due to the complexity of the logic within this class, a text description will not give the full picture.
It is recommended to carefully read the code and use a debugger to understand how it works.
AbstractTrainer makes far fewer assumptions about the problem than Learner and Predictor.
It expects these ambiguities to have already been resolved upstream. For example, problem_type, feature_metadata, num_classes, etc.
Parameters
----------
path : str
Path to save and load trainer artifacts to disk.
Path should end in `/` or `os.path.sep`.
problem_type : str
One of ['binary', 'multiclass', 'regression', 'quantile', 'softclass']
num_classes : int
The number of classes in the problem.
If problem_type is in ['regression', 'quantile'], this must be None.
If problem_type is 'binary', this must be 2.
If problem_type is in ['multiclass', 'softclass'], this must be >= 2.
feature_metadata : FeatureMetadata
FeatureMetadata for X. Sent to each model during fit.
eval_metric : Scorer, default = None
Metric to optimize. If None, a default metric is used depending on the problem_type.
quantile_levels : List[float] | np.ndarray, default = None
# TODO: Add documentation, not documented in Predictor.
Only used when problem_type=quantile
low_memory : bool, default = True
Deprecated parameter, likely to be removed in future versions.
If True, caches models to disk separately instead of containing all models within memory.
If False, may cause a variety of bugs.
k_fold : int, default = 0
If <2, then non-bagged mode is used.
If >= 2, then bagged mode is used with num_bag_folds == k_fold for each model.
Bagged mode changes the way models are trained and ensembled.
Bagged mode enables multi-layer stacking and repeated bagging.
n_repeats : int, default = 1
The maximum repeats of bagging to do when in bagged mode.
Larger values take linearly longer to train and infer, but improve quality slightly.
sample_weight : str, default = None
Column name of the sample weight in X
weight_evaluation : bool, default = False
If True, the eval_metric is calculated with sample_weight incorporated into the score.
save_data : bool, default = True
Whether to cache the data (X, y, X_val, y_val) to disk.
Required for a variety of advanced post-fit functionality.
It is recommended to keep as True.
random_state : int, default = 0
Random state for data splitting in bagged mode.
verbosity : int, default = 2
Verbosity levels range from 0 to 4 and control how much information is printed.
Higher levels correspond to more detailed print statements (you can set verbosity = 0 to suppress warnings).
If using logging, you can alternatively control amount of information printed via `logger.setLevel(L)`,
where `L` ranges from 0 to 50 (Note: higher values of `L` correspond to fewer print statements, opposite of verbosity levels).
"""

trainer_file_name = "trainer.pkl"
trainer_info_name = "info.pkl"
trainer_info_json_name = "info.json"
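For orientation, here is a minimal, hedged sketch of constructing a trainer with the parameters documented above. `ConcreteTrainer` is a hypothetical subclass used purely for illustration; in practice Learner/Predictor builds and drives the trainer, and users should not instantiate one themselves.

```python
from autogluon.core.trainer.abstract_trainer import AbstractTrainer

# Hypothetical subclass for illustration only; real trainers live in the
# tabular package and are created internally by Learner/Predictor.
class ConcreteTrainer(AbstractTrainer):
    pass

trainer = ConcreteTrainer(
    path="ag_models/",          # must end in '/' or os.path.sep
    problem_type="multiclass",
    num_classes=3,              # must be >= 2 for multiclass/softclass
    k_fold=8,                   # >= 2 enables bagged mode
    n_repeats=2,                # repeated bagging; linearly more training time
    verbosity=2,                # 0-4; higher prints more detail
)
```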
Expand All @@ -90,20 +150,23 @@ class AbstractTrainer:
def __init__(
self,
path: str,
*,
problem_type: str,
eval_metric=None,
num_classes=None,
quantile_levels=None,
low_memory=False,
feature_metadata=None,
k_fold=0,
n_repeats=1,
sample_weight=None,
weight_evaluation=False,
save_data=False,
random_state=0,
verbosity=2,
num_classes: int = None,
feature_metadata: FeatureMetadata = None,
eval_metric: Scorer = None,
quantile_levels: List[float] | np.ndarray = None,
low_memory: bool = True,
Inline review comment (Contributor Author):
Note: low_memory previously defaulted to False; however, in every case where AbstractTrainer was initialized, it was set to True. False is not actively supported, so I've updated the default to True here; this should have no impact.

k_fold: int = 0,
n_repeats: int = 1,
sample_weight: str = None,
weight_evaluation: bool = False,
save_data: bool = False,
random_state: int = 0,
verbosity: int = 2,
):
self._validate_num_classes(num_classes=num_classes, problem_type=problem_type)
self._validate_quantile_levels(quantile_levels=quantile_levels, problem_type=problem_type)
self.path = path
self.problem_type = problem_type
self.feature_metadata = feature_metadata
@@ -3836,3 +3899,23 @@ def calibrate_decision_threshold(
verbose=verbose,
**kwargs,
)

@staticmethod
def _validate_num_classes(num_classes: int, problem_type: str):
if problem_type == BINARY:
assert num_classes is not None and num_classes == 2, f"num_classes must be 2 when problem_type='{problem_type}' (num_classes={num_classes})"
elif problem_type in [MULTICLASS, SOFTCLASS]:
assert num_classes is not None and num_classes >= 2, f"num_classes must be >=2 when problem_type='{problem_type}' (num_classes={num_classes})"
elif problem_type in [REGRESSION, QUANTILE]:
assert num_classes is None, f"num_clases must be None when problem_type='{problem_type}' (num_classes={num_classes})"
else:
raise AssertionError(f"Unknown problem_type: '{problem_type}'. Valid problem types: {[BINARY, MULTICLASS, REGRESSION, SOFTCLASS, QUANTILE]}")

@staticmethod
def _validate_quantile_levels(quantile_levels: List[float] | np.ndarray, problem_type: str):
if problem_type == QUANTILE:
assert quantile_levels is not None, f"quantile_levels must not be None when problem_type='{problem_type}' (quantile_levels={quantile_levels})"
assert isinstance(quantile_levels, (list, np.ndarray)), f"quantile_levels must be a list or np.ndarray (quantile_levels={quantile_levels})"
assert len(quantile_levels) > 0, f"quantile_levels must not be an empty list (quantile_levels={quantile_levels})"
else:
assert quantile_levels is None, f"quantile_levels must be None when problem_type='{problem_type}' (quantile_levels={quantile_levels})"
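Since the new validators are static methods, their behavior is easy to check in isolation. A small hedged sketch, assuming this branch is installed and that the problem-type constants resolve to the usual AutoGluon strings ('binary', 'regression', 'quantile', ...):

```python
from autogluon.core.trainer.abstract_trainer import AbstractTrainer

# Valid combinations pass silently:
AbstractTrainer._validate_num_classes(num_classes=2, problem_type="binary")
AbstractTrainer._validate_num_classes(num_classes=None, problem_type="regression")
AbstractTrainer._validate_quantile_levels(quantile_levels=[0.1, 0.5, 0.9], problem_type="quantile")

# Invalid combinations raise AssertionError with the messages defined above:
try:
    AbstractTrainer._validate_num_classes(num_classes=3, problem_type="binary")
except AssertionError as err:
    print(err)  # num_classes must be 2 when problem_type='binary' (num_classes=3)
```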
2 changes: 0 additions & 2 deletions tabular/src/autogluon/tabular/predictor/predictor.py
@@ -70,8 +70,6 @@
# Extra TODOs (Stretch): Can occur post v1.0
# TODO: make core_kwargs a kwargs argument to predictor.fit
# TODO: add aux_kwargs to predictor.fit
# TODO: add pip freeze + python version output after fit + log file, validate that same pip freeze on load as cached
# TODO: Add logging comments that models are serialized on disk after fit
# TODO: consider adding kwarg option for data which has already been preprocessed by feature generator to skip feature generation.
# TODO: Resolve raw text feature usage in default feature generator
# TODO: num_bag_sets -> ag_args