From a88296e3938a4a44038fb52e0bf5474179820e47 Mon Sep 17 00:00:00 2001 From: Nick Erickson Date: Sat, 9 Dec 2023 17:26:42 -0800 Subject: [PATCH] Tabular: Move deprecated methods to mixin (#3795) --- .../tabular/tabular-essentials.ipynb | 309 +++++++++++++----- docs/tutorials/tabular/tabular-indepth.ipynb | 72 ++-- .../tabular/predictor/_deprecated_methods.py | 67 ++++ .../autogluon/tabular/predictor/predictor.py | 59 +--- 4 files changed, 346 insertions(+), 161 deletions(-) create mode 100644 tabular/src/autogluon/tabular/predictor/_deprecated_methods.py diff --git a/docs/tutorials/tabular/tabular-essentials.ipynb b/docs/tutorials/tabular/tabular-essentials.ipynb index 1a61b5045a7..2d0e2107c39 100644 --- a/docs/tutorials/tabular/tabular-essentials.ipynb +++ b/docs/tutorials/tabular/tabular-essentials.ipynb @@ -14,7 +14,9 @@ "\n", "Via a simple `fit()` call, AutoGluon can produce highly-accurate models to predict the values in one column of a data table based on the rest of the columns' values. Use AutoGluon with tabular data for both classification and regression problems. This tutorial demonstrates how to use AutoGluon to produce a classification model that predicts whether or not a person's income exceeds $50,000.\n", "\n", - "To start, import AutoGluon's TabularPredictor and TabularDataset classes:" + "## TabularPredictor\n", + "\n", + "To start, import AutoGluon's [TabularPredictor](../../api/autogluon.tabular.TabularPredictor.rst) and [TabularDataset](../../api/autogluon.core.TabularDataset.rst) classes:" ] }, { @@ -67,7 +69,7 @@ "id": "0ac3f9f5", "metadata": {}, "source": [ - "Note that we loaded data from a CSV file stored in the cloud ([AWS s3 bucket](https://aws.amazon.com/s3/)), but you can you specify a local file-path instead if you have already downloaded the CSV file to your own machine (e.g., using [wget](https://www.gnu.org/software/wget/)).\n", + "Note that we loaded data from a CSV file stored in the cloud. 
You can also specify a local file-path instead if you have already downloaded the CSV file to your own machine (e.g., using [wget](https://www.gnu.org/software/wget/)).\n", "Each row in the table `train_data` corresponds to a single training example. In this particular dataset, each row corresponds to an individual person, and the columns contain various characteristics reported during a census.\n", "\n", "Let's first use these features to predict whether the person's income exceeds $50,000 or not, which is recorded in the `class` column of this table." @@ -81,7 +83,7 @@ "outputs": [], "source": [ "label = 'class'\n", - "print(\"Summary of class variable: \\n\", train_data[label].describe())" + "print(f\"Unique classes: {list(train_data[label].unique())}\")" ] }, { @@ -89,18 +91,25 @@ "id": "e2808c11", "metadata": {}, "source": [ - "Now use AutoGluon to train multiple models:" + "AutoGluon works with raw data, meaning you don't need to perform any data preprocessing before fitting AutoGluon. We actively recommend that you avoid performing operations such as missing value imputation or one-hot-encoding, as AutoGluon has dedicated logic to handle these situations automatically. You can learn more about AutoGluon's preprocessing in the [Feature Engineering Tutorial](tabular-feature-engineering.ipynb).\n", + "\n", + "### Training\n", + "\n", + "Now we initialize and fit AutoGluon's TabularPredictor in one line of code:" ] }, { "cell_type": "code", "execution_count": null, "id": "93ed52d4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ - "save_path = 'agModels-predictClass' # specifies folder to store trained models\n", - "predictor = TabularPredictor(label=label, path=save_path).fit(train_data)" + "predictor = TabularPredictor(label=label).fit(train_data)" ] }, { @@ -108,6 +117,10 @@ "id": "1088b80f", "metadata": {}, "source": [ + "That's it! 
We now have a TabularPredictor that is able to make predictions on new data.\n", + "\n", + "### Prediction\n", + "\n", "Next, load separate test data to demonstrate how to make predictions on new examples at inference time:" ] }, @@ -119,9 +132,7 @@ "outputs": [], "source": [ "test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')\n", - "y_test = test_data[label] # values to predict\n", - "test_data_nolab = test_data.drop(columns=[label]) # delete label column to prove we're not cheating\n", - "test_data_nolab.head()" + "test_data.head()" ] }, { @@ -129,13 +140,7 @@ "id": "01bd6e65", "metadata": {}, "source": [ - "We use our trained models to make predictions on the new data and then evaluate performance:\n", - "\n", - "```{warning}\n", - "\n", - "`TabularPredictor.load()` uses `pickle` module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Never load data that could have come from an untrusted source, or that could have been tampered with. 
**Only load data you trust.**\n", - "\n", - "```" + "We can now use our trained models to make predictions on the new data:" ] }, { @@ -145,19 +150,53 @@ "metadata": {}, "outputs": [], "source": [ - "predictor = TabularPredictor.load(save_path) # unnecessary, just demonstrates how to load previously-trained predictor from file\n", - "\n", - "y_pred = predictor.predict(test_data_nolab)\n", - "print(\"Predictions: \\n\", y_pred)\n", - "perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)" + "y_pred = predictor.predict(test_data)\n", + "y_pred.head() # Predictions" ] }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "y_pred_proba = predictor.predict_proba(test_data)\n", + "y_pred_proba.head() # Prediction Probabilities" + ], + "metadata": { + "collapsed": false + }, + "id": "1f2ea44baed01439" + }, + { + "cell_type": "markdown", + "source": [ + "### Evaluation\n", + "\n", + "Next, we can [evaluate](../../api/autogluon.tabular.TabularPredictor.evaluate.rst) the predictor on the (labeled) test data:" + ], + "metadata": { + "collapsed": false + }, + "id": "c1ac16b755097c93" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "predictor.evaluate(test_data)" + ], + "metadata": { + "collapsed": false + }, + "id": "ccfb48acf364b609" + }, { "cell_type": "markdown", "id": "ec141019", "metadata": {}, "source": [ - "We can also evaluate the performance of each individual trained model on our (labeled) test data:" + "We can also [evaluate each model individually](../../api/autogluon.tabular.TabularPredictor.leaderboard.rst):" ] }, { @@ -170,6 +209,44 @@ "predictor.leaderboard(test_data)" ] }, + { + "cell_type": "markdown", + "source": [ + "### Loading a Trained Predictor\n", + "\n", + "Finally, we can load the predictor in a new session (or new machine) by calling [TabularPredictor.load()](../../api/autogluon.tabular.TabularPredictor.load.rst) and specifying the 
location of the predictor artifact on disk." + ], + "metadata": { + "collapsed": false + }, + "id": "ae35bc029d386579" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "predictor.path # The path on disk where the predictor is saved" + ], + "metadata": { + "collapsed": false + }, + "id": "85fcbc65e9dd2cfd" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Load the predictor by specifying the path it is saved to on disk.\n", + "# You can control where it is saved to by setting the `path` parameter during init\n", + "predictor = TabularPredictor.load(predictor.path)" + ], + "metadata": { + "collapsed": false + }, + "id": "3710a0faca8d4af1" + }, { "cell_type": "markdown", "id": "6a32a595", @@ -195,100 +272,128 @@ "id": "255b4558", "metadata": {}, "source": [ - "**Note:** This simple call to `fit()` is intended for your first prototype model. In a subsequent section, we'll demonstrate how to maximize predictive performance by additionally specifying the `presets` parameter to `fit()` and the `eval_metric` parameter to `TabularPredictor()`.\n", + "**Note:** This simple call to [TabularPredictor.fit()](../../api/autogluon.tabular.TabularPredictor.fit.rst) is intended for your first prototype model. In a subsequent section, we'll demonstrate how to maximize predictive performance by additionally specifying the `presets` parameter to `fit()` and the `eval_metric` parameter to `TabularPredictor()`.\n", "\n", - "## Description of fit():\n", + "## Description of fit()\n", "\n", "Here we discuss what happened during `fit()`.\n", "\n", "Since there are only two possible values of the `class` variable, this was a binary classification problem, for which an appropriate performance metric is _accuracy_. AutoGluon automatically infers this as well as the type of each feature (i.e., which columns contain continuous numbers vs. discrete categories). 
AutoGluon can also automatically handle common issues like missing data and rescaling feature values.\n", "\n", - "We did not specify separate validation data and so AutoGluon automatically choses a random training/validation split of the data. The data used for validation is separated from the training data and is used to determine the models and hyperparameter-values that produce the best results. Rather than just a single model, AutoGluon trains multiple models and ensembles them together to ensure superior predictive performance.\n", + "We did not specify separate validation data and so AutoGluon automatically chose a random training/validation split of the data. The data used for validation is separated from the training data and is used to determine the models and hyperparameter-values that produce the best results. Rather than just a single model, AutoGluon trains multiple models and ensembles them together to obtain superior predictive performance.\n", "\n", - "By default, AutoGluon tries to fit various types of models including neural networks and tree ensembles. Each type of model has various hyperparameters, which traditionally, the user would have to specify.\n", - "AutoGluon automates this process.\n", + "By default, AutoGluon tries to fit [various types of models](../../api/autogluon.tabular.models.rst) including neural networks and tree ensembles. Each type of model has various hyperparameters, which traditionally, the user would have to specify. AutoGluon automates this process.\n", "\n", - "AutoGluon automatically and iteratively tests values for hyperparameters to produce the best performance on the validation data. This involves repeatedly training models under different hyperparameter settings and evaluating their performance. This process can be computationally-intensive, so `fit()` can parallelize this process across multiple threads (and machines if distributed resources are available). 
To control runtimes, you can specify various arguments in `fit()` as demonstrated in the subsequent **In-Depth** tutorial.\n", - "\n", - "For tabular problems, `fit()` returns a `Predictor` object. For classification, you can easily output predicted class probabilities instead of predicted classes:" + "AutoGluon automatically and iteratively tests values for hyperparameters to produce the best performance on the validation data. This involves repeatedly training models under different hyperparameter settings and evaluating their performance. This process can be computationally-intensive, so `fit()` parallelizes this process across multiple threads using [Ray](https://www.ray.io/). To control runtimes, you can specify various arguments in `fit()` such as `time_limit` as demonstrated in the subsequent **[In-Depth Tutorial](tabular-indepth.ipynb)**." + ] + }, + { + "cell_type": "markdown", + "id": "75f84eca", + "metadata": {}, + "source": [ + "We can view what properties AutoGluon automatically inferred about our prediction task:" ] }, { "cell_type": "code", "execution_count": null, - "id": "1b4ec4d8", + "id": "f4074d3a", "metadata": {}, "outputs": [], "source": [ - "pred_probs = predictor.predict_proba(test_data_nolab)\n", - "pred_probs.head(5)" + "print(\"AutoGluon infers problem type is: \", predictor.problem_type)\n", + "print(\"AutoGluon identified the following types of features:\")\n", + "print(predictor.feature_metadata)" ] }, { "cell_type": "markdown", - "id": "ce5f8671", + "id": "14fde02c", "metadata": {}, "source": [ - "Besides inference, this object can also summarize what happened during fit." + "AutoGluon correctly recognized our prediction problem to be a **binary classification** task and decided that variables such as `age` should be represented as integers, whereas variables such as `workclass` should be represented as categorical objects. 
The `feature_metadata` attribute allows you to see the inferred data type of each predictive variable after preprocessing (this is its _raw_ dtype; some features may also be associated with additional _special_ dtypes if produced via feature-engineering, e.g. numerical representations of a datetime/text column)." ] }, + { + "cell_type": "markdown", + "source": [ + "To transform the data into AutoGluon's internal representation, we can do the following:" + ], + "metadata": { + "collapsed": false + }, + "id": "27f0ef525a7db211" + }, { "cell_type": "code", "execution_count": null, - "id": "b40922c2", - "metadata": {}, "outputs": [], "source": [ - "results = predictor.fit_summary(show_plot=True)" - ] + "test_data_transform = predictor.transform_features(test_data)\n", + "test_data_transform.head()" + ], + "metadata": { + "collapsed": false + }, + "id": "addae3bd40b4318a" }, { "cell_type": "markdown", - "id": "75f84eca", - "metadata": {}, "source": [ - "From this summary, we can see that AutoGluon trained many different types of models as well as an ensemble of the best-performing models. The summary also describes the actual models that were trained during fit and how well each model performed on the held-out validation data. 
We can view what properties AutoGluon automatically inferred about our prediction task:" - ] + "Notice how the data is purely numeric after pre-processing (although categorical features will still be treated as categorical downstream).\n", + "\n", + "To better understand our trained predictor, we can estimate the overall importance of each feature via [TabularPredictor.feature_importance()](../../api/autogluon.tabular.TabularPredictor.feature_importance.rst):" + ], + "metadata": { + "collapsed": false + }, + "id": "5a608ba782bef998" }, { "cell_type": "code", "execution_count": null, - "id": "f4074d3a", - "metadata": {}, "outputs": [], "source": [ - "print(\"AutoGluon infers problem type is: \", predictor.problem_type)\n", - "print(\"AutoGluon identified the following types of features:\")\n", - "print(predictor.feature_metadata)" - ] + "predictor.feature_importance(test_data)" + ], + "metadata": { + "collapsed": false + }, + "id": "567ebed45b3ba83c" }, { "cell_type": "markdown", - "id": "14fde02c", + "id": "ef4f97a2", "metadata": {}, "source": [ - "AutoGluon correctly recognized our prediction problem to be a **binary classification** task and decided that variables such as `age` should be represented as integers, whereas variables such as `workclass` should be represented as categorical objects. The `feature_metadata` attribute allows you to see the inferred data type of each predictive variable after preprocessing (this is its _raw_ dtype; some features may also be associated with additional _special_ dtypes if produced via feature-engineering, e.g. 
numerical representations of a datetime/text column).\n", + "The `importance` column is an estimate for the amount the evaluation metric score would drop if the feature were removed from the data.\n", + "Negative values of `importance` mean that it is likely to improve the results if re-fit with the feature removed.\n", "\n", - "We can evaluate the performance of each individual trained model on our (labeled) test data:" + "When we call `predict()`, AutoGluon automatically predicts with the model that displayed the best performance on validation data (i.e. the weighted-ensemble)." ] }, { "cell_type": "code", "execution_count": null, - "id": "ae442e45", - "metadata": {}, "outputs": [], "source": [ - "predictor.leaderboard(test_data)" - ] + "predictor.model_best" + ], + "metadata": { + "collapsed": false + }, + "id": "79066cd8f9a34ee8" }, { "cell_type": "markdown", - "id": "ef4f97a2", - "metadata": {}, "source": [ - "When we call `predict()`, AutoGluon automatically predicts with the model that displayed the best performance on validation data (i.e. the weighted-ensemble). We can instead specify which model to use for predictions like this:" - ] + "We can instead specify which model to use for predictions like this:" + ], + "metadata": { + "collapsed": false + }, + "id": "fb0ca088eaf1e452" }, { "cell_type": "markdown", @@ -297,25 +402,39 @@ "source": [ "```\n", "predictor.predict(test_data, model='LightGBM')\n", - "```\n" + "```\n", + "\n", + "You can get the list of trained models via `.leaderboard()` or `.model_names()`:" ] }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "predictor.model_names()" + ], + "metadata": { + "collapsed": false + }, + "id": "6eaa4acf8afdf20a" + }, { "cell_type": "markdown", "id": "d0a30ee6", "metadata": {}, "source": [ - "Above the scores of predictive performance were based on a default evaluation metric (accuracy for binary classification). 
Performance in certain applications may be measured by different metrics than the ones AutoGluon optimizes for by default. If you know the metric that counts in your application, you should specify it as demonstrated in the next section.\n", + "The scores of predictive performance above were based on a default evaluation metric (accuracy for binary classification). Performance in certain applications may be measured by different metrics than the ones AutoGluon optimizes for by default. If you know the metric that counts in your application, you should specify it via the `eval_metric` argument as demonstrated in the next section.\n", "\n", "## Presets\n", "\n", "AutoGluon comes with a variety of presets that can be specified in the call to `.fit` via the `presets` argument. `medium_quality` is used by default to encourage initial prototyping, but for serious usage, the other presets should be used instead.\n", "\n", "| Preset | Model Quality | Use Cases | Fit Time (Ideal) | Inference Time (Relative to medium_quality) | Disk Usage |\n", - "| :------------- | :----------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------ | :--------------- | :------------------------------------------ | :--------- |\n", + "| :------------- |:-------------------------------------------------------| :------------------------------------------------------------------------------------------------------------------------------------------------------ |:-----------------| :------------------------------------------ | :--------- |\n", "| best_quality | State-of-the-art (SOTA), much better than high_quality | When accuracy is what matters | 16x+ | 32x+ | 16x+ |\n", - "| high_quality | Better than good_quality | When a very powerful, portable solution with fast inference is required: Large-scale batch inference | 16x | 4x | 2x |\n", - "| 
good_quality | Significantly better than medium_quality | When a powerful, highly portable solution with very fast inference is required: Billion-scale batch inference, sub-100ms online-inference, edge-devices | 16x | 2x | 0.1x |\n", + "| high_quality | Better than good_quality | When a very powerful, portable solution with fast inference is required: Large-scale batch inference | 16x+ | 4x | 2x |\n", + "| good_quality | Stronger than any other AutoML Framework | When a powerful, highly portable solution with very fast inference is required: Billion-scale batch inference, sub-100ms online-inference, edge-devices | 16x | 2x | 0.1x |\n", "| medium_quality | Competitive with other top AutoML Frameworks | Initial prototyping, establishing a performance baseline | 1x | 1x | 1x |\n", "\n", "We recommend users to start with `medium_quality` to get a sense of the problem and identify any data related issues. If `medium_quality` is taking too long to train, consider subsampling the training data during this prototyping phase. 
\n", @@ -334,15 +453,30 @@ "cell_type": "code", "execution_count": null, "id": "358b121a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ "time_limit = 60 # for quick demonstration only, you should set this to longest time you are willing to wait (in seconds)\n", "metric = 'roc_auc' # specify your evaluation metric here\n", - "predictor = TabularPredictor(label, eval_metric=metric).fit(train_data, time_limit=time_limit, presets='best_quality')\n", - "predictor.leaderboard(test_data)" + "predictor = TabularPredictor(label, eval_metric=metric).fit(train_data, time_limit=time_limit, presets='best_quality')" ] }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "predictor.leaderboard(test_data)" + ], + "metadata": { + "collapsed": false + }, + "id": "b45474df26853911" + }, { "cell_type": "markdown", "id": "cf8a57a7", @@ -352,13 +486,13 @@ "\n", "- Specify the argument `presets='best_quality'`, which allows AutoGluon to automatically construct powerful model ensembles based on [stacking/bagging](https://arxiv.org/abs/2003.06505), and will greatly improve the resulting predictions if granted sufficient training time. The default value of `presets` is `'medium_quality'`, which produces _less_ accurate models but facilitates faster prototyping. With `presets`, you can flexibly prioritize predictive accuracy vs. training/inference speed. For example, if you care less about predictive performance and want to quickly deploy a basic model, consider using: `presets=['good_quality', 'optimize_for_deployment']`.\n", "\n", - "- Provide the parameter `eval_metric` to `TabularPredictor()` if you know what metric will be used to evaluate predictions in your application. 
Some other non-default metrics you might use include things like: `'f1'` (for binary classification), `'roc_auc'` (for binary classification), `'log_loss'` (for classification), `'mean_absolute_error'` (for regression), `'median_absolute_error'` (for regression). You can also define your own custom metric function. For more information refer to [Adding a custom metric to AutoGluon](advanced/tabular-custom-metric.ipynb)\n", + "- Provide the parameter `eval_metric` to `TabularPredictor()` if you know what metric will be used to evaluate predictions in your application. Some other non-default metrics you might use include things like: `'f1'` (for binary classification), `'roc_auc'` (for binary classification), `'log_loss'` (for classification), `'mean_absolute_error'` (for regression), `'median_absolute_error'` (for regression). You can also define your own custom metric function. For more information refer to [Adding a custom metric to AutoGluon](advanced/tabular-custom-metric.ipynb).\n", "\n", "- Include all your data in `train_data` and do not provide `tuning_data` (AutoGluon will split the data more intelligently to fit its needs).\n", "\n", "- Do not specify the `hyperparameter_tune_kwargs` argument (counterintuitively, hyperparameter tuning is not the best way to spend a limited training time budgets, as model ensembling is often superior). We recommend you only use `hyperparameter_tune_kwargs` if your goal is to deploy a single model rather than an ensemble.\n", "\n", - "- Do not specify `hyperparameters` argument (allow AutoGluon to adaptively select which models/hyperparameters to use).\n", + "- Do not specify the `hyperparameters` argument (allow AutoGluon to adaptively select which models/hyperparameters to use).\n", "\n", "- Set `time_limit` to the longest amount of time (in seconds) that you are willing to wait. 
AutoGluon's predictive performance improves the longer `fit()` is allowed to run.\n", "\n", @@ -375,7 +509,7 @@ "outputs": [], "source": [ "age_column = 'age'\n", - "print(\"Summary of age variable: \\n\", train_data[age_column].describe())" + "train_data[age_column].head()" ] }, { @@ -390,19 +524,34 @@ "cell_type": "code", "execution_count": null, "id": "36e8f913", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ - "predictor_age = TabularPredictor(label=age_column, path=\"agModels-predictAge\").fit(train_data, time_limit=60)\n", - "performance = predictor_age.evaluate(test_data)" + "predictor_age = TabularPredictor(label=age_column, path=\"agModels-predictAge\").fit(train_data, time_limit=60)" ] }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "predictor_age.evaluate(test_data)" + ], + "metadata": { + "collapsed": false + }, + "id": "d4564a06d1766c76" + }, { "cell_type": "markdown", "id": "46af4e18", "metadata": {}, "source": [ - "Note that we didn't need to tell AutoGluon this is a regression problem, it automatically inferred this from the data and reported the appropriate performance metric (RMSE by default). To specify a particular evaluation metric other than the default, set the `eval_metric` parameter of `TabularPredictor()` and AutoGluon will tailor its models to optimize your metric (e.g. `eval_metric = 'mean_absolute_error'`). For evaluation metrics where higher values are worse (like RMSE), AutoGluon will flip their sign and print them as negative values during training (as it internally assumes higher values are better).\n", + "Note that we didn't need to tell AutoGluon this is a regression problem; it automatically inferred this from the data and reported the appropriate performance metric (RMSE by default). 
To specify a particular evaluation metric other than the default, set the `eval_metric` parameter of [TabularPredictor()](../../api/autogluon.tabular.TabularPredictor.rst) and AutoGluon will tailor its models to optimize your metric (e.g. `eval_metric = 'mean_absolute_error'`). For evaluation metrics where higher values are worse (like RMSE), AutoGluon will flip their sign and print them as negative values during training (as it internally assumes higher values are better). You can even specify a custom metric by following the [Custom Metric Tutorial](advanced/tabular-custom-metric.ipynb).\n", "\n", "We can call leaderboard to see the per-model performance:" ] @@ -422,15 +571,17 @@ "id": "9d692ceb", "metadata": {}, "source": [ - "**Data Formats:** AutoGluon can currently operate on data tables already loaded into Python as pandas DataFrames, or those stored in files of [CSV format](https://en.wikipedia.org/wiki/Comma-separated_values) or [Parquet format](https://databricks.com/glossary/what-is-parquet). If your data live in multiple tables, you will first need to join them into a single table whose rows correspond to statistically independent observations (datapoints) and columns correspond to different features (aka. variables/covariates).\n", + "**Data Formats:** AutoGluon can currently operate on data tables already loaded into Python as pandas DataFrames, or those stored in files of [CSV format](https://en.wikipedia.org/wiki/Comma-separated_values) or [Parquet format](https://databricks.com/glossary/what-is-parquet). If your data lives in multiple tables, you will first need to join them into a single table whose rows correspond to statistically independent observations (datapoints) and columns correspond to different features (aka. 
variables/covariates).\n", "\n", "Refer to the [TabularPredictor documentation](../../api/autogluon.tabular.TabularPredictor.rst) to see all of the available methods/options.\n", "\n", "## Advanced Usage\n", "\n", - "For more advanced usage examples of AutoGluon, refer to [Predicting Columns in a Table - In Depth](tabular-indepth.ipynb)\n", + "For more advanced usage examples of AutoGluon, refer to the [In-Depth Tutorial](tabular-indepth.ipynb).\n", + "\n", + "If you are interested in deployment optimization, refer to the [Deployment Optimization Tutorial](advanced/tabular-deployment.ipynb).\n", "\n", - "If you are interested in deployment optimization, refer to the [Predicting Columns in a Table - Deployment Optimization](advanced/tabular-deployment.ipynb) tutorial." + "For adding custom models to AutoGluon, refer to the [Custom Model](advanced/tabular-custom-model.ipynb) and [Custom Model Advanced](advanced/tabular-custom-model-advanced.ipynb) tutorials." ] } ], diff --git a/docs/tutorials/tabular/tabular-indepth.ipynb b/docs/tutorials/tabular/tabular-indepth.ipynb index 2fe1782508b..4d58804fdba 100644 --- a/docs/tutorials/tabular/tabular-indepth.ipynb +++ b/docs/tutorials/tabular/tabular-indepth.ipynb @@ -60,13 +60,12 @@ ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "98733672", - "metadata": {}, "source": [ "## Specifying hyperparameters and tuning them\n", "\n", + "**Note: We don't recommend doing hyperparameter-tuning with AutoGluon in most cases**. AutoGluon achieves its best performance without hyperparameter tuning, by simply specifying `presets=\"best_quality\"`.\n", + "\n", "We first demonstrate hyperparameter-tuning and how you can provide your own validation dataset that AutoGluon internally relies on to: tune hyperparameters, early-stop iterative training, and construct model ensembles. 
One reason you may specify validation data is when future test data will stem from a different distribution than training data (and your specified validation data is more representative of the future data that will likely be encountered).\n", "\n", " If you don't have a strong reason to provide your own validation dataset, we recommend you omit the `tuning_data` argument. This lets AutoGluon automatically select validation data from your provided training set (it uses smart strategies such as stratified sampling). For greater control, you can specify the `holdout_frac` argument to tell AutoGluon what fraction of the provided training data to hold out for validation.\n", @@ -76,13 +75,15 @@ "`fit()` trains neural networks and various types of tree ensembles by default. You can specify various hyperparameter values for each type of model. For each hyperparameter, you can either specify a single fixed value, or a search space of values to consider during hyperparameter optimization. Hyperparameters which you do not specify are left at default settings chosen automatically by AutoGluon, which may be fixed values or search spaces.\n", "\n", "Refer to the [Search Space documentation](../../api/autogluon.common.space.rst) to learn more about AutoGluon search space." - ] + ], + "metadata": { + "collapsed": false + }, + "id": "98733672" }, { "cell_type": "code", "execution_count": null, - "id": "87f28cf4", - "metadata": {}, "outputs": [], "source": [ "from autogluon.common import space\n", @@ -120,58 +121,79 @@ " hyperparameters=hyperparameters,\n", " hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,\n", ")" - ] + ], + "metadata": { + "collapsed": false + }, + "id": "87f28cf4" }, { "cell_type": "markdown", - "id": "816e4beb", - "metadata": {}, "source": [ "We again demonstrate how to use the trained models to predict on the test data." 
- ] + ], + "metadata": { + "collapsed": false + }, + "id": "816e4beb" }, { "cell_type": "code", "execution_count": null, - "id": "3bf2965a", - "metadata": {}, "outputs": [], "source": [ "y_pred = predictor.predict(test_data_nolabel)\n", "print(\"Predictions: \", list(y_pred)[:5])\n", "perf = predictor.evaluate(test_data, auxiliary_metrics=False)" - ] + ], + "metadata": { + "collapsed": false + }, + "id": "3bf2965a" }, { "cell_type": "markdown", - "id": "5c2f4648", - "metadata": {}, "source": [ "Use the following to view a summary of what happened during `fit()`. Now this command will show details of the hyperparameter-tuning process for each type of model:" - ] + ], + "metadata": { + "collapsed": false + }, + "id": "5c2f4648" }, { "cell_type": "code", "execution_count": null, - "id": "1bfc4fe3", - "metadata": {}, "outputs": [], "source": [ "results = predictor.fit_summary()" - ] + ], + "metadata": { + "collapsed": false + }, + "id": "1bfc4fe3" + }, + { + "cell_type": "markdown", + "source": [ + "In the above example, the predictive performance may be poor because we specified very little training to ensure quick runtimes. You can call `fit()` multiple times while modifying the above settings to better understand how these choices affect performance outcomes. For example: you can comment out the `train_data.head` command or increase `subsample_size` to train using a larger dataset, increase the `num_epochs` and `num_boost_round` hyperparameters, and increase the `time_limit` (which you should do for all code in these tutorials). To see more detailed output during the execution of `fit()`, you can also pass in the argument: `verbosity = 3`." + ], + "metadata": { + "collapsed": false + }, + "id": "1d06b7ab" }, { "cell_type": "markdown", - "id": "1d06b7ab", - "metadata": {}, "source": [ - "In the above example, the predictive performance may be poor because we specified very little training to ensure quick runtimes. 
You can call `fit()` multiple times while modifying the above settings to better understand how these choices affect performance outcomes. For example: you can comment out the `train_data.head` command or increase `subsample_size` to train using a larger dataset, increase the `num_epochs` and `num_boost_round` hyperparameters, and increase the `time_limit` (which you should do for all code in these tutorials). To see more detailed output during the execution of `fit()`, you can also pass in the argument: `verbosity = 3`.\n", - "\n", - "\n", "## Model ensembling with stacking/bagging\n", "\n", "Beyond hyperparameter-tuning with a correctly-specified evaluation metric, two other methods to boost predictive performance are [bagging and stack-ensembling](https://arxiv.org/abs/2003.06505). You'll often see performance improve if you specify `num_bag_folds` = 5-10, `num_stack_levels` = 1-3 in the call to `fit()`, but this will increase training times and memory/disk usage." - ] + ], + "metadata": { + "collapsed": false + }, + "id": "cc894bfde6cbc5f1" }, { "cell_type": "code", diff --git a/tabular/src/autogluon/tabular/predictor/_deprecated_methods.py b/tabular/src/autogluon/tabular/predictor/_deprecated_methods.py new file mode 100644 index 00000000000..985166e13ed --- /dev/null +++ b/tabular/src/autogluon/tabular/predictor/_deprecated_methods.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import Dict, List + +import numpy as np +import pandas as pd + +from autogluon.common.utils import Deprecated + + +class TabularPredictorDeprecatedMixin: + """Contains deprecated methods from TabularPredictor that shouldn't show up in API documentation.""" + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="persist") + def persist_models(self, *args, **kwargs) -> List[str]: + """Deprecated method. 
Use `persist` instead.""" + return self.persist(*args, **kwargs) + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="unpersist") + def unpersist_models(self, *args, **kwargs) -> List[str]: + """Deprecated method. Use `unpersist` instead.""" + return self.unpersist(*args, **kwargs) + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="model_names") + def get_model_names(self, *args, **kwargs) -> List[str]: + """Deprecated method. Use `model_names` instead.""" + return self.model_names(*args, **kwargs) + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="model_best") + def get_model_best(self) -> str: + """Deprecated method. Use `model_best` instead.""" + return self.model_best + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="predict_from_proba") + def get_pred_from_proba(self, *args, **kwargs) -> pd.Series | np.array: + """Deprecated method. Use `predict_from_proba` instead.""" + return self.predict_from_proba(*args, **kwargs) + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="model_refit_map") + def get_model_full_dict(self, *args, **kwargs) -> Dict[str, str]: + """Deprecated method. Use `model_refit_map` instead.""" + return self.model_refit_map(*args, **kwargs) + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="predict_proba_oof") + def get_oof_pred_proba(self, *args, **kwargs) -> pd.DataFrame | pd.Series: + """Deprecated method. Use `predict_proba_oof` instead.""" + return self.predict_proba_oof(*args, **kwargs) + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="predict_oof") + def get_oof_pred(self, *args, **kwargs) -> pd.Series: + """Deprecated method. 
Use `predict_oof` instead.""" + return self.predict_oof(*args, **kwargs) + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="disk_usage_per_file") + def get_size_disk_per_file(self, *args, **kwargs) -> pd.Series: + """Deprecated method. Use `disk_usage_per_file` instead.""" + return self.disk_usage_per_file(*args, **kwargs) + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="disk_usage") + def get_size_disk(self) -> int: + """Deprecated method. Use `disk_usage` instead.""" + return self.disk_usage() + + @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="model_names(persisted=True)") + def get_model_names_persisted(self) -> List[str]: + """Deprecated method. Use `model_names(persisted=True)` instead.""" + return self.model_names(persisted=True) diff --git a/tabular/src/autogluon/tabular/predictor/predictor.py b/tabular/src/autogluon/tabular/predictor/predictor.py index 3e2a2a3b5ab..8976157816e 100644 --- a/tabular/src/autogluon/tabular/predictor/predictor.py +++ b/tabular/src/autogluon/tabular/predictor/predictor.py @@ -16,7 +16,6 @@ from autogluon.common.loaders import load_json from autogluon.common.savers import save_json -from autogluon.common.utils import Deprecated from autogluon.common.utils.file_utils import get_directory_size, get_directory_size_per_file from autogluon.common.utils.log_utils import add_log_to_file, set_logger_verbosity from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage @@ -62,6 +61,7 @@ from ..configs.presets_configs import tabular_presets_alias, tabular_presets_dict from ..learner import AbstractTabularLearner, DefaultLearner from ..trainer.model_presets.presets import MODEL_TYPES +from ._deprecated_methods import TabularPredictorDeprecatedMixin logger = logging.getLogger(__name__) # return autogluon root logger @@ -74,7 +74,7 @@ # TODO: consider adding kwarg option 
for data which has already been preprocessed by feature generator to skip feature generation. # TODO: Resolve raw text feature usage in default feature generator # TODO: num_bag_sets -> ag_args -class TabularPredictor: +class TabularPredictor(TabularPredictorDeprecatedMixin): """ AutoGluon TabularPredictor predicts values in a column of a tabular dataset (classification or regression). @@ -4741,61 +4741,6 @@ def _assert_is_fit(self, message_suffix: str = None): error_message = f"{error_message} `.{message_suffix}`." raise AssertionError(error_message) - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="persist") - def persist_models(self, *args, **kwargs) -> List[str]: - """Deprecated method. Use `persist` instead.""" - return self.persist(*args, **kwargs) - - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="unpersist") - def unpersist_models(self, *args, **kwargs) -> List[str]: - """Deprecated method. Use `unpersist` instead.""" - return self.unpersist(*args, **kwargs) - - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="model_names") - def get_model_names(self, *args, **kwargs) -> List[str]: - """Deprecated method. Use `model_names` instead.""" - return self.model_names(*args, **kwargs) - - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="model_best") - def get_model_best(self) -> str: - """Deprecated method. Use `model_best` instead.""" - return self.model_best - - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="predict_from_proba") - def get_pred_from_proba(self, *args, **kwargs) -> pd.Series | np.array: - """Deprecated method. 
Use `predict_from_proba` instead.""" - return self.predict_from_proba(*args, **kwargs) - - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="model_refit_map") - def get_model_full_dict(self, *args, **kwargs) -> Dict[str, str]: - """Deprecated method. Use `model_refit_map` instead.""" - return self.model_refit_map(*args, **kwargs) - - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="predict_proba_oof") - def get_oof_pred_proba(self, *args, **kwargs) -> pd.DataFrame | pd.Series: - """Deprecated method. Use `predict_proba_oof` instead.""" - return self.predict_proba_oof(*args, **kwargs) - - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="predict_oof") - def get_oof_pred(self, *args, **kwargs) -> pd.Series: - """Deprecated method. Use `predict_oof` instead.""" - return self.predict_oof(*args, **kwargs) - - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="disk_usage_per_file") - def get_size_disk_per_file(self, *args, **kwargs) -> pd.Series: - """Deprecated method. Use `disk_usage_per_file` instead.""" - return self.disk_usage_per_file(*args, **kwargs) - - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="disk_usage") - def get_size_disk(self) -> int: - """Deprecated method. Use `disk_usage` instead.""" - return self.disk_usage() - - @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="model_names(persisted=True)") - def get_model_names_persisted(self) -> List[str]: - """Deprecated method. Use `model_names(persisted=True)` instead.""" - return self.model_names(persisted=True) - # Location to store WIP functionality that will be later added to TabularPredictor class _TabularPredictorExperimental(TabularPredictor):