From 69e7b116e6d2c4b7b320fbbf3690353125f16039 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Fri, 20 Aug 2021 12:18:59 -0400
Subject: [PATCH 01/62] init

---
 Makefile | 1 +
 evalml/__init__.py | 1 +
 evalml/__main__.py | 4 ++
 evalml/demos/__init__.py | 1 +
 evalml/demos/breast_cancer.py | 1 +
 evalml/demos/churn.py | 4 +-
 evalml/demos/diabetes.py | 4 +-
 evalml/demos/fraud.py | 4 +-
 evalml/demos/wine.py | 1 +
 evalml/model_understanding/graphs.py | 49 ++++++++++---------
 .../permutation_importance.py | 1 +
 11 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/Makefile b/Makefile
index a8eef9198c..f8fda8aa0d 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ clean:
 .PHONY: lint
 lint:
 	flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions
+	pydocstyle evalml
 	black evalml -t py39 --check
 
 .PHONY: lint-fix
diff --git a/evalml/__init__.py b/evalml/__init__.py
index ab9aaefdaf..e821828fc8 100644
--- a/evalml/__init__.py
+++ b/evalml/__init__.py
@@ -1,3 +1,4 @@
+"""This is a docstring."""
 import warnings
 
 # hack to prevent warnings from skopt
diff --git a/evalml/__main__.py b/evalml/__main__.py
index 3ac984578e..d92173eb11 100644
--- a/evalml/__main__.py
+++ b/evalml/__main__.py
@@ -1,3 +1,5 @@
+"""I'm a docstring."""
+
 import click
 
 from evalml.utils.cli_utils import print_info
@@ -5,11 +7,13 @@
 
 @click.group()
 def cli():
+    """I'm a docstring."""
     pass
 
 
 @click.command()
 def info():
+    """I'm a docstring."""
     print_info()
 
 
diff --git a/evalml/demos/__init__.py b/evalml/demos/__init__.py
index a14b52b6ed..b4b7f4a3b9 100644
--- a/evalml/demos/__init__.py
+++ b/evalml/demos/__init__.py
@@ -1,3 +1,4 @@
+"""Demo datasets."""
 from .breast_cancer import load_breast_cancer
 from .diabetes import load_diabetes
 from .fraud import load_fraud
diff --git a/evalml/demos/breast_cancer.py b/evalml/demos/breast_cancer.py
index aa16aa8068..fd83efdbd6 100644
--- a/evalml/demos/breast_cancer.py
+++ b/evalml/demos/breast_cancer.py
@@ -1,3 +1,4 @@
+"""Load the breast cancer dataset, which can be used for binary classification problems."""
 import woodwork as ww
 
 import evalml
diff --git a/evalml/demos/churn.py b/evalml/demos/churn.py
index 33404d3fa5..05336a659c 100644
--- a/evalml/demos/churn.py
+++ b/evalml/demos/churn.py
@@ -1,10 +1,10 @@
+"""Load the churn dataset, which can be used for binary classification problems."""
 import evalml
 from evalml.preprocessing import load_data
 
 
 def load_churn(n_rows=None, verbose=True):
-    """Load credit card fraud dataset.
-    The fraud dataset can be used for binary classification problems.
+    """Load churn dataset, which can be used for binary classification problems.
 
     Arguments:
         n_rows (int): Number of rows from the dataset to return
diff --git a/evalml/demos/diabetes.py b/evalml/demos/diabetes.py
index fdc88699e3..9642b28840 100644
--- a/evalml/demos/diabetes.py
+++ b/evalml/demos/diabetes.py
@@ -1,3 +1,5 @@
+"""Load the diabetes dataset, which can be used for regression problems."""
+
 import woodwork as ww
 
 import evalml
@@ -5,7 +7,7 @@
 
 
 def load_diabetes():
-    """Load diabetes dataset. Regression problem
+    """Load diabetes dataset. Used for regression problems.
 
     Returns:
         (pd.Dataframe, pd.Series): X and y
diff --git a/evalml/demos/fraud.py b/evalml/demos/fraud.py
index 5b006521a0..bc56e72a3c 100644
--- a/evalml/demos/fraud.py
+++ b/evalml/demos/fraud.py
@@ -1,10 +1,12 @@
+"""Load the credit card fraud dataset, which can be used for binary classification problems."""
 import evalml
 from evalml.preprocessing import load_data
 
 
 def load_fraud(n_rows=None, verbose=True):
     """Load credit card fraud dataset.
-    The fraud dataset can be used for binary classification problems.
+
+    The fraud dataset can be used for binary classification problems.
 
     Arguments:
         n_rows (int): Number of rows from the dataset to return
diff --git a/evalml/demos/wine.py b/evalml/demos/wine.py
index b439dc7484..e9fee6dba3 100644
--- a/evalml/demos/wine.py
+++ b/evalml/demos/wine.py
@@ -1,3 +1,4 @@
+"""Load and return the wine dataset, which can be used for multiclass classification problems."""
 import woodwork as ww
 
 import evalml
diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index 01c45e6340..d3557c012f 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -1,3 +1,5 @@
+"""I'm a docstring."""
+
 import copy
 import os
 import warnings
@@ -63,7 +65,7 @@ def confusion_matrix(y_true, y_predicted, normalize_method="true"):
 
 
 def normalize_confusion_matrix(conf_mat, normalize_method="true"):
-    """Normalizes a confusion matrix.
+    """Normalize a confusion matrix.
 
     Arguments:
         conf_mat (pd.DataFrame or np.ndarray): Confusion matrix to normalize.
@@ -426,8 +428,7 @@ def graph_permutation_importance(pipeline, X, y, objective, importance_threshold
 
 
 def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100):
-    """Computes objective score as a function of potential binary classification
-    decision thresholds for a fitted binary classification pipeline.
+    """Compute objective score as a function of potential binary classification decision thresholds for a fitted binary classification pipeline.
 
     Arguments:
         pipeline (BinaryClassificationPipeline obj): Fitted binary classification pipeline
@@ -438,7 +439,6 @@ def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100):
 
     Returns:
         pd.DataFrame: DataFrame with thresholds and the corresponding objective score calculated at each threshold
-
     """
     objective = get_objective(objective, return_instance=True)
     if not objective.is_defined_for_problem_type(ProblemTypes.BINARY):
@@ -460,7 +460,7 @@ def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100):
 
 
 def graph_binary_objective_vs_threshold(pipeline, X, y, objective, steps=100):
-    """Generates a plot graphing objective score vs. decision thresholds for a fitted binary classification pipeline.
+    """Generate a plot graphing objective score vs. decision thresholds for a fitted binary classification pipeline.
 
     Arguments:
         pipeline (PipelineBase or subclass): Fitted pipeline
@@ -506,7 +506,7 @@ def _is_feature_of_type(feature, X, ltype):
 
 
 def _put_categorical_feature_first(features, first_feature_categorical):
     """If the user is doing a two-way partial dependence plot and one of the features is categorical,
-    we need to make sure the categorical feature is the first element in the tuple that's passed to sklearn.
+    we need to make sure the categorical feature is the first element in the tuple that's passed to sklearn.
 
     This is because in the two-way grid calculation, sklearn will try to coerce every element of the grid to the
     type of the first feature in the tuple. If we put the categorical feature first, the grid will be of type 'object'
     which can accommodate both categorical and numeric data. If we put the numeric feature first, the grid will be of
     type float64 and we can't coerce categoricals to float64 dtype.
     """
     new_features = features if first_feature_categorical else (features[1], features[0])
     return new_features
@@ -529,7 +529,7 @@ def _get_feature_names_from_str_or_col_index(X, names_or_col_indices):
 
 
 def _raise_value_error_if_any_features_all_nan(df):
-    """Helper for partial dependence data validation."""
+    """Validate partial dependence data by checking if any features have all NaN values."""
     nan_pct = df.isna().mean()
     all_nan = nan_pct[nan_pct == 1].index.tolist()
     all_nan = [f"'{name}'" for name in all_nan]
@@ -543,7 +543,7 @@ def _raise_value_error_if_mostly_one_value(df, percentile):
-        """Helper for partial dependence data validation."""
+        """Validate partial dependence data by checking if features are mostly one value."""
     one_value = []
     values = []
@@ -565,9 +565,10 @@ def _raise_value_error_if_mostly_one_value(df, percentile):
 
 def partial_dependence(
     pipeline, X, features, percentiles=(0.05, 0.95), grid_resolution=100, kind="average"
 ):
-    """Calculates one or two-way partial dependence. If a single integer or
-    string is given for features, one-way partial dependence is calculated. If
-    a tuple of two integers or strings is given, two-way partial dependence
+    """Calculate one or two-way partial dependence.
+
+    If a single integer or string is given for features, one-way partial dependence is calculated.
+    If a tuple of two integers or strings is given, two-way partial dependence
     is calculated with the first feature in the y-axis and second feature in the x-axis.
@@ -932,16 +933,16 @@ def _update_fig_with_two_way_partial_dependence(
 
 def graph_partial_dependence(
     pipeline, X, features, class_label=None, grid_resolution=100, kind="average"
 ):
-    """Create an one-way or two-way partial dependence plot. Passing a single integer or
-    string as features will create a one-way partial dependence plot with the feature values
-    plotted against the partial dependence. Passing features a tuple of int/strings will create
-    a two-way partial dependence plot with a contour of feature[0] in the y-axis, feature[1]
-    in the x-axis and the partial dependence in the z-axis.
+    """Create an one-way or two-way partial dependence plot. Passing a single integer or
+    string as features will create a one-way partial dependence plot with the feature values
+    plotted against the partial dependence. Passing features a tuple of int/strings will create
+    a two-way partial dependence plot with a contour of feature[0] in the y-axis, feature[1]
+    in the x-axis and the partial dependence in the z-axis.
 
     Arguments:
-        pipeline (PipelineBase or subclass): Fitted pipeline
+        pipeline (PipelineBase or subclass): Fitted pipeline.
         X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values
-            for feature where partial dependence will be calculated at
+            for feature where partial dependence will be calculated at.
         features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for.
             If features is an int, it must be the index of the feature to use.
             If features is a string, it must be a valid column name in X.
@@ -1171,7 +1172,7 @@ def _add_ice_plot(_go, fig, ice_data, label=None, row=None, col=None):
 
 
 def _calculate_axis_range(arr):
-    """Helper method to help calculate the appropriate range for an axis based on the data to graph."""
+    """A helper method to help calculate the appropriate range for an axis based on the data to graph."""
     max_value = arr.max()
     min_value = arr.min()
     margins = abs(max_value - min_value) * 0.05
@@ -1179,7 +1180,7 @@ def _calculate_axis_range(arr):
 
 
 def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None):
-    """Combines y_true and y_pred into a single dataframe and adds a column for outliers. Used in `graph_prediction_vs_actual()`.
+    """Combine y_true and y_pred into a single dataframe and add a column for outliers. Used in `graph_prediction_vs_actual()`.
 
     Arguments:
         y_true (pd.Series, or np.ndarray): The real target values of the data
@@ -1326,13 +1327,13 @@ def decision_tree_data_from_estimator(estimator):
 
 
 def decision_tree_data_from_pipeline(pipeline_):
-    """Return data for a fitted pipeline with in a restructured format
+    """Return data for a fitted pipeline in a restructured format.
 
     Arguments:
         pipeline_ (PipelineBase): A pipeline with a DecisionTree-based estimator.
 
     Returns:
-        OrderedDict: An OrderedDict of OrderedDicts describing a tree structure
+        OrderedDict: An OrderedDict of OrderedDicts describing a tree structure.
     """
     if not pipeline_.model_family == ModelFamily.DECISION_TREE:
         raise ValueError(
@@ -1352,7 +1353,7 @@ def decision_tree_data_from_pipeline(pipeline_):
 
 def visualize_decision_tree(
     estimator, max_depth=None, rotate=False, filled=False, filepath=None
 ):
-    """Generate an image visualizing the decision tree
+    """Generate an image visualizing the decision tree.
 
     Arguments:
         estimator (ComponentBase): A fitted DecisionTree-based estimator.
@@ -1504,7 +1505,7 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, dates):
 
 
 def get_linear_coefficients(estimator, features=None):
-    """Returns a dataframe showing the features with the greatest predictive power for a linear model.
+    """Return a dataframe showing the features with the greatest predictive power for a linear model.
 
     Arguments:
         estimator (Estimator): Fitted linear model family estimator.
diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py
index a2420e63f9..5f2c50dc77 100644
--- a/evalml/model_understanding/permutation_importance.py
+++ b/evalml/model_understanding/permutation_importance.py
@@ -1,3 +1,4 @@
+"""I'm a docstring."""
 import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed

From 82f358c0c53181cfea1b37faa4c4edf3d8b278cb Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Fri, 20 Aug 2021 12:32:27 -0400
Subject: [PATCH 02/62] add auto

---
 Makefile | 1 +
 dev-requirements.txt | 3 +-
 docs/notebook_version_standardizer.py | 2 +-
 docs/source/conf.py | 10 ++----
 .../automl_algorithm/automl_algorithm.py | 9 +++--
 .../automl_algorithm/evalml_algorithm.py | 2 +-
 .../automl_algorithm/iterative_algorithm.py | 7 ++--
 evalml/automl/automl_search.py | 16 ++++-----
 evalml/automl/callbacks.py | 10 ++++--
 evalml/automl/engine/cf_engine.py | 5 ++-
 evalml/automl/engine/dask_engine.py | 5 ++-
 evalml/automl/engine/engine_base.py | 15 ++++----
 evalml/automl/engine/sequential_engine.py | 8 +++--
 evalml/automl/utils.py | 5 ++-
 .../data_checks/class_imbalance_data_check.py | 3 +-
 evalml/data_checks/data_check.py | 9 +++--
 evalml/data_checks/data_check_action.py | 6 +++-
 evalml/data_checks/data_check_message.py | 6 +++-
 evalml/data_checks/data_checks.py | 7 ++--
 .../data_checks/datetime_format_data_check.py | 4 +--
 evalml/data_checks/highly_null_data_check.py | 2 --
 .../multicollinearity_data_check.py | 1 -
 evalml/data_checks/outliers_data_check.py | 24 +++----------
 evalml/data_checks/sparsity_data_check.py | 6 ++--
 .../target_distribution_data_check.py | 3 +-
 .../data_checks/target_leakage_data_check.py | 3 +-
 evalml/data_checks/uniqueness_data_check.py | 10 ++----
 evalml/data_checks/utils.py | 3 +-
 evalml/exceptions/exceptions.py | 6 ++--
 evalml/model_family/utils.py | 2 +-
 evalml/model_understanding/graphs.py | 35 +++++++------------
 .../permutation_importance.py | 7 ++--
 .../_user_interface.py | 9 ++---
 .../prediction_explanations/explainers.py | 9 +++--
 .../binary_classification_objective.py | 6 +++-
 evalml/objectives/cost_benefit_matrix.py | 5 +--
 evalml/objectives/objective_base.py | 21 ++++++++---
 evalml/objectives/sensitivity_low_alert.py | 7 ++--
 evalml/objectives/standard_metrics.py | 21 ++++++++---
 .../binary_classification_pipeline_mixin.py | 5 ++-
 evalml/pipelines/classification_pipeline.py | 10 +++---
 evalml/pipelines/component_graph.py | 14 ++++----
 evalml/pipelines/components/component_base.py | 35 ++++++++++++-------
 .../components/component_base_meta.py | 6 ++--
 .../ensemble/sklearn_stacked_ensemble_base.py | 2 +-
 .../classifiers/catboost_classifier.py | 4 +--
 .../classifiers/elasticnet_classifier.py | 3 +-
 .../estimators/classifiers/et_classifier.py | 3 +-
 .../classifiers/kneighbors_classifier.py | 8 ++---
 .../logistic_regression_classifier.py | 3 +-
 .../estimators/classifiers/rf_classifier.py | 3 +-
 .../estimators/classifiers/svm_classifier.py | 4 +--
 .../classifiers/xgboost_classifier.py | 3 +-
 .../components/estimators/estimator.py | 2 +-
 .../estimators/regressors/arima_regressor.py | 9 ++---
 .../regressors/baseline_regressor.py | 5 +--
 .../regressors/catboost_regressor.py | 4 +--
 .../regressors/decision_tree_regressor.py | 3 +-
 .../regressors/prophet_regressor.py | 9 ++---
 .../estimators/regressors/svm_regressor.py | 1 +
 .../time_series_baseline_estimator.py | 1 -
 .../transformers/column_selectors.py | 12 +++----
 .../dimensionality_reduction/lda.py | 3 +-
 .../dimensionality_reduction/pca.py | 3 +-
 .../transformers/encoders/target_encoder.py | 3 +-
 .../feature_selection/feature_selector.py | 3 +-
 .../rf_classifier_feature_selector.py | 3 +-
 .../rf_regressor_feature_selector.py | 3 +-
 .../transformers/imputers/imputer.py | 6 ++--
 .../imputers/per_column_imputer.py | 2 +-
 .../transformers/imputers/simple_imputer.py | 5 ++-
 .../transformers/imputers/target_imputer.py | 9 ++---
 .../preprocessing/datetime_featurizer.py | 5 ++-
 .../preprocessing/featuretools.py | 2 +-
 .../preprocessing/text_featurizer.py | 6 ++--
 .../transform_primitive_components.py | 3 +-
 .../transformers/samplers/base_sampler.py | 16 +++------
 .../transformers/samplers/oversamplers.py | 7 ++--
 .../transformers/samplers/undersampler.py | 3 +-
 .../transformers/scalers/standard_scaler.py | 1 -
 .../components/transformers/transformer.py | 5 ++-
 evalml/pipelines/components/utils.py | 14 +++-----
 evalml/pipelines/pipeline_base.py | 23 +++++-------
 evalml/pipelines/pipeline_meta.py | 6 ++--
 evalml/pipelines/regression_pipeline.py | 2 +-
 evalml/pipelines/utils.py | 13 +++----
 .../training_validation_split.py | 4 +--
 evalml/problem_types/utils.py | 28 ++++++++-------
 evalml/tests/automl_tests/dask_test_utils.py | 18 ++++++----
 .../parallel_tests/test_automl_dask.py | 10 +++---
 .../parallel_tests/test_cf_engine.py | 21 ++++-------
 .../parallel_tests/test_dask_engine.py | 21 ++++-------
 evalml/tests/automl_tests/test_automl.py | 5 +--
 .../test_catboost_classifier.py | 2 +-
 .../test_catboost_regressor.py | 2 +-
 .../component_tests/test_lgbm_classifier.py | 2 +-
 .../component_tests/test_lgbm_regressor.py | 2 +-
 .../component_tests/test_simple_imputer.py | 5 +--
 .../test_xgboost_classifier.py | 2 +-
 .../component_tests/test_xgboost_regressor.py | 2 +-
 evalml/tests/conftest.py | 8 ++---
 .../data_checks_tests/test_data_check.py | 2 +-
 .../data_checks_tests/test_data_checks.py | 4 +--
 .../test_algorithms.py | 6 ++--
 .../test_partial_dependence.py | 7 ++--
 .../test_permutation_importance.py | 7 ++--
 .../test_binary_classification_objective.py | 6 ++--
 .../tests/objective_tests/test_objectives.py | 2 +-
 .../pipeline_tests/test_component_graph.py | 12 +++---
 .../tests/utils_tests/test_woodwork_utils.py | 8 ++---
 evalml/tuners/grid_search_tuner.py | 8 ++---
 evalml/tuners/random_search_tuner.py | 6 ++--
 evalml/tuners/tuner.py | 4 +--
 evalml/utils/base_meta.py | 2 +-
 evalml/utils/gen_utils.py | 14 +++-----
 evalml/utils/woodwork_utils.py | 14 +++----
 116 files changed, 385 insertions(+), 457 deletions(-)

diff --git a/Makefile b/Makefile
index f8fda8aa0d..53a404d3b7 100644
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,7 @@ lint:
 
 .PHONY: lint-fix
 lint-fix:
+	docformatter --in-place . -r --wrap-summaries 0
 	black -t py39 evalml
 	isort evalml
 	python docs/notebook_version_standardizer.py standardize
diff --git a/dev-requirements.txt b/dev-requirements.txt
index e2b3876b0d..fd52a4b605 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -4,4 +4,5 @@
 flake8==3.7.0
 black==21.5b1
 isort==5.0.0
-
+docformatter==1.4
+pydocstyle==6.1.1
diff --git a/docs/notebook_version_standardizer.py b/docs/notebook_version_standardizer.py
index a8cbdf86c9..48c51676b6 100644
--- a/docs/notebook_version_standardizer.py
+++ b/docs/notebook_version_standardizer.py
@@ -47,7 +47,7 @@ def _standardize_versions(notebooks, desired_version="3.8.6"):
 
 @click.group()
 def cli():
-    """no-op"""
+    """no-op."""
 
 
 @cli.command()
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 08a4830d11..e7be4e2027 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -228,8 +228,7 @@
 
 
 class AccessorLevelDocumenter(Documenter):
-    """
-    Documenter subclass for objects on accessor level (methods, attributes).
+    """Documenter subclass for objects on accessor level (methods, attributes).
 
     Referenced pandas-sphinx-theme (https://github.com/pandas-dev/pandas-sphinx-theme)
     and sphinx-doc (https://github.com/sphinx-doc/sphinx/blob/8c7faed6fcbc6b7d40f497698cb80fc10aee1ab3/sphinx/ext/autodoc/__init__.py#L846)
@@ -243,10 +242,7 @@ def resolve_name(self, modname, parents, path, base):
 
 
 class AccessorCallableDocumenter(AccessorLevelDocumenter, MethodDocumenter):
-    """
-    This documenter lets us removes .__call__ from the method signature for
-    callable accessors like Series.plot
-    """
+    """This documenter lets us remove .__call__ from the method signature for callable accessors like Series.plot."""
 
     objtype = "accessorcallable"
     directivetype = "method"
@@ -269,7 +265,7 @@ class AccessorMethodDocumenter(AccessorLevelDocumenter, MethodDocumenter):
 
 
 class PatchedPythonDomain(PythonDomain):
-    """To disable cross-reference warning: https://github.com/sphinx-doc/sphinx/issues/3866"""
+    """To disable cross-reference warning: https://github.com/sphinx-doc/sphinx/issues/3866."""
 
     def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode):
         if 'refspecific' in node:
             del node['refspecific']
diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py
index d8f4b03b52..77919f7ac6 100644
--- a/evalml/automl/automl_algorithm/automl_algorithm.py
+++ b/evalml/automl/automl_algorithm/automl_algorithm.py
@@ -5,14 +5,13 @@
 
 
 class AutoMLAlgorithmException(Exception):
-    """Exception raised when an error is encountered during the computation of the automl algorithm"""
+    """Exception raised when an error is encountered during the computation of the automl algorithm."""
 
     pass
 
 
 class AutoMLAlgorithm(ABC):
-    """
-    Base class for the AutoML algorithms which power EvalML.
+    """Base class for the AutoML algorithms which power EvalML.
 
     This class represents an automated machine learning (AutoML) algorithm.
     It encapsulates the decision-making logic behind an automl search, by both deciding which pipelines to evaluate next and by deciding what set of parameters to configure the pipeline with.
@@ -51,14 +50,14 @@ def __init__(
 
     @abstractmethod
     def next_batch(self):
-        """Get the next batch of pipelines to evaluate
+        """Get the next batch of pipelines to evaluate.
 
         Returns:
             list(PipelineBase): a list of instances of PipelineBase subclasses, ready to be trained and evaluated.
         """
 
     def add_result(self, score_to_minimize, pipeline, trained_pipeline_results):
-        """Register results from evaluating a pipeline
+        """Register results from evaluating a pipeline.
 
         Arguments:
             score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines.
diff --git a/evalml/automl/automl_algorithm/evalml_algorithm.py b/evalml/automl/automl_algorithm/evalml_algorithm.py
index 495b91515f..ace23de0c2 100644
--- a/evalml/automl/automl_algorithm/evalml_algorithm.py
+++ b/evalml/automl/automl_algorithm/evalml_algorithm.py
@@ -265,7 +265,7 @@ def _create_long_exploration(self, n):
         return self._create_n_pipelines(pipelines, self.num_long_explore_pipelines)
 
     def next_batch(self):
-        """Get the next batch of pipelines to evaluate
+        """Get the next batch of pipelines to evaluate.
 
         Returns:
             list(PipelineBase): a list of instances of PipelineBase subclasses, ready to be trained and evaluated.
diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py
index cd7fec3828..b80a9f443e 100644
--- a/evalml/automl/automl_algorithm/iterative_algorithm.py
+++ b/evalml/automl/automl_algorithm/iterative_algorithm.py
@@ -22,8 +22,7 @@
 
 
 class IterativeAlgorithm(AutoMLAlgorithm):
-    """
-    An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance.
+    """An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance.
 
     Arguments:
         allowed_pipelines (list(class)): A list of PipelineBase instances indicating the pipelines allowed in the search. The default of None indicates all pipelines for this problem type are allowed.
@@ -131,7 +130,7 @@ def __init__(
         )
 
     def next_batch(self):
-        """Get the next batch of pipelines to evaluate
+        """Get the next batch of pipelines to evaluate.
 
         Returns:
             list(PipelineBase): a list of instances of PipelineBase subclasses, ready to be trained and evaluated.
@@ -197,7 +196,7 @@ def next_batch(self):
         return next_batch
 
     def add_result(self, score_to_minimize, pipeline, trained_pipeline_results):
-        """Register results from evaluating a pipeline
+        """Register results from evaluating a pipeline.
 
         Arguments:
             score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines.
diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
index f9184f9d42..30275f5958 100644
--- a/evalml/automl/automl_search.py
+++ b/evalml/automl/automl_search.py
@@ -922,8 +922,7 @@ def search(self, show_iteration_plot=True):
         self._searched = True
 
     def _find_best_pipeline(self):
-        """Finds the best pipeline in the rankings
-        If self._best_pipeline already exists, check to make sure it is different from the current best pipeline before training and thresholding"""
+        """Finds the best pipeline in the rankings. If self._best_pipeline already exists, check to make sure it is different from the current best pipeline before training and thresholding."""
         if len(self.rankings) == 0:
             return
         best_pipeline = self.rankings.iloc[0]
@@ -942,7 +941,7 @@ def _find_best_pipeline(self):
         self._best_pipeline = best_pipeline
 
     def _num_pipelines(self):
-        """Return the number of pipeline evaluations which have been made
+        """Return the number of pipeline evaluations which have been made.
 
         Returns:
             int: the number of pipeline evaluations made in the search
@@ -1198,8 +1197,7 @@ def _check_for_high_variance(self, pipeline, cv_scores, threshold=0.5):
         return high_variance_cv
 
     def get_pipeline(self, pipeline_id):
-        """Given the ID of a pipeline training result, returns an untrained instance of the specified pipeline
-        initialized with the parameters used to train that pipeline during automl search.
+        """Given the ID of a pipeline training result, returns an untrained instance of the specified pipeline initialized with the parameters used to train that pipeline during automl search.
 
         Arguments:
             pipeline_id (int): pipeline to retrieve
@@ -1219,7 +1217,7 @@ def get_pipeline(self, pipeline_id):
         return pipeline.new(parameters, random_seed=self.random_seed)
 
     def describe_pipeline(self, pipeline_id, return_dict=False):
-        """Describe a pipeline
+        """Describe a pipeline.
 
         Arguments:
             pipeline_id (int): pipeline to describe
@@ -1335,7 +1333,7 @@ def rankings(self):
 
     @property
     def full_rankings(self):
-        """Returns a pandas.DataFrame with scoring results from all pipelines searched"""
+        """Returns a pandas.DataFrame with scoring results from all pipelines searched."""
         ascending = True
         if self.objective.greater_is_better:
             ascending = False
@@ -1388,7 +1386,7 @@ def save(
         pickle_type="cloudpickle",
         pickle_protocol=cloudpickle.DEFAULT_PROTOCOL,
     ):
-        """Saves AutoML object at file path
+        """Saves AutoML object at file path.
 
         Arguments:
             file_path (str): location to save file
@@ -1415,7 +1413,7 @@ def load(
         file_path,
         pickle_type="cloudpickle",
     ):
-        """Loads AutoML object at file path
+        """Loads AutoML object at file path.
 
         Arguments:
             file_path (str): location to find file to load
diff --git a/evalml/automl/callbacks.py b/evalml/automl/callbacks.py
index 77f8a20012..fe32ed7b62 100644
--- a/evalml/automl/callbacks.py
+++ b/evalml/automl/callbacks.py
@@ -9,14 +9,20 @@ def silent_error_callback(exception, traceback, automl, **kwargs):
 
 
 def raise_error_callback(exception, traceback, automl, **kwargs):
-    """Raises the exception thrown by the AutoMLSearch object. Also logs the exception as an error."""
+    """Raises the exception thrown by the AutoMLSearch object.
+
+    Also logs the exception as an error.
+    """
     logger.error(f"AutoML search raised a fatal exception: {str(exception)}")
     logger.error("\n".join(traceback))
     raise exception
 
 
 def log_error_callback(exception, traceback, automl, **kwargs):
-    """Logs the exception thrown as an error. Will not throw. This is the default behavior for AutoMLSearch."""
+    """Logs the exception thrown as an error.
+
+    Will not throw. This is the default behavior for AutoMLSearch.
+    """
     fold_num = kwargs.get("fold_num")
     pipeline = kwargs.get("pipeline")
     trace = "\n".join(traceback) if traceback else ""
diff --git a/evalml/automl/engine/cf_engine.py b/evalml/automl/engine/cf_engine.py
index a0fbd834d6..5060725fdd 100644
--- a/evalml/automl/engine/cf_engine.py
+++ b/evalml/automl/engine/cf_engine.py
@@ -49,8 +49,7 @@ def done(self):
         return self.work.done()
 
     def get_result(self):
-        """Gets the computation result.
-        Will block until the computation is finished.
+        """Gets the computation result. Will block until the computation is finished.
 
         Raises:
             Exception: If computation fails. Returns traceback.
@@ -80,7 +79,7 @@ def is_cancelled(self):
 
 
 class CFEngine(EngineBase):
-    """The concurrent.futures (CF) engine"""
+    """The concurrent.futures (CF) engine."""
 
     def __init__(self, client):
         if not isinstance(client, CFClient):
diff --git a/evalml/automl/engine/dask_engine.py b/evalml/automl/engine/dask_engine.py
index 9cd2c0425d..fa5ec84ab7 100644
--- a/evalml/automl/engine/dask_engine.py
+++ b/evalml/automl/engine/dask_engine.py
@@ -29,8 +29,7 @@ def done(self):
         return self.work.done()
 
     def get_result(self):
-        """Gets the computation result.
-        Will block until the computation is finished.
+        """Gets the computation result. Will block until the computation is finished.
 
         Raises:
             Exception: If computation fails. Returns traceback.
@@ -51,7 +50,7 @@ def is_cancelled(self):
 
 
 class DaskEngine(EngineBase):
-    """The dask engine"""
+    """The dask engine."""
 
     def __init__(self, client):
         if not isinstance(client, Client):
diff --git a/evalml/automl/engine/engine_base.py b/evalml/automl/engine/engine_base.py
index edc51bfb8b..daabc91353 100644
--- a/evalml/automl/engine/engine_base.py
+++ b/evalml/automl/engine/engine_base.py
@@ -20,8 +20,7 @@ class EngineComputation(ABC):
 
     @abstractmethod
     def get_result(self):
-        """Gets the computation result.
-        Will block until the computation is finished.
+        """Gets the computation result. Will block until the computation is finished.
 
         Raises Exception: If computation fails. Returns traceback.
         """
@@ -38,8 +37,9 @@ def cancel(self):
 
 class JobLogger:
     """Mimics the behavior of a python logging.Logger but stores all messages rather than actually logging them.
 
-    This is used during engine jobs so that log messages are recorded after the job completes. This is desired so that
-    all of the messages for a single job are grouped together in the log.
+    This is used during engine jobs so that log messages are recorded
+    after the job completes. This is desired so that all of the messages
+    for a single job are grouped together in the log.
     """
 
     def __init__(self):
@@ -62,7 +62,10 @@ def error(self, msg):
         self.logs.append(("error", msg))
 
     def write_to_logger(self, logger):
-        """Write all the messages to the logger. First In First Out order."""
+        """Write all the messages to the logger.
+
+        First In First Out order.
+        """
         logger_method = {
             "info": logger.info,
             "debug": logger.debug,
@@ -142,7 +145,7 @@ def train_pipeline(pipeline, X, y, automl_config, schema=True):
 
 
 def train_and_score_pipeline(
     pipeline, automl_config, full_X_train, full_y_train, logger
 ):
-    """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores
+    """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores.
 
     Arguments:
         pipeline (PipelineBase): The pipeline to score
diff --git a/evalml/automl/engine/sequential_engine.py b/evalml/automl/engine/sequential_engine.py
index cb656fc194..acabde2204 100644
--- a/evalml/automl/engine/sequential_engine.py
+++ b/evalml/automl/engine/sequential_engine.py
@@ -31,8 +31,7 @@ def done(self):
         return True
 
     def get_result(self):
-        """Gets the computation result.
-        Will block until the computation is finished.
+        """Gets the computation result. Will block until the computation is finished.
 
         Raises Exception: If computation fails. Returns traceback.
         """
@@ -43,7 +42,10 @@ def cancel(self):
 
 
 class SequentialEngine(EngineBase):
-    """The default engine for the AutoML search. Trains and scores pipelines locally and sequentially."""
+    """The default engine for the AutoML search.
+
+    Trains and scores pipelines locally and sequentially.
+    """
 
     def submit_evaluation_job(self, automl_config, pipeline, X, y):
         logger = self.setup_job_log()
diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py
index 16f69e73f6..357fd32816 100644
--- a/evalml/automl/utils.py
+++ b/evalml/automl/utils.py
@@ -102,7 +102,7 @@ def make_data_splitter(
 
 
 def tune_binary_threshold(
     pipeline, objective, problem_type, X_threshold_tuning, y_threshold_tuning
 ):
-    """Tunes the threshold of a binary pipeline to the X and y thresholding data
+    """Tunes the threshold of a binary pipeline to the X and y thresholding data.
 
     Arguments:
         pipeline (Pipeline): Pipeline instance to threshold.
@@ -208,8 +208,7 @@ def get_best_sampler_for_data(X, y, sampler_method, sampler_balanced_ratio):
 
 
 def get_pipelines_from_component_graphs(
     component_graphs_dict, problem_type, parameters=None, random_seed=0
 ):
-    """
-    Returns created pipelines from passed component graphs based on the specified problem type.
+    """Returns created pipelines from passed component graphs based on the specified problem type.
 
     Arguments:
         component_graphs_dict (dict): The dict of component graphs.
diff --git a/evalml/data_checks/class_imbalance_data_check.py b/evalml/data_checks/class_imbalance_data_check.py
index db24accbde..36715c425e 100644
--- a/evalml/data_checks/class_imbalance_data_check.py
+++ b/evalml/data_checks/class_imbalance_data_check.py
@@ -42,8 +42,7 @@ def __init__(self, threshold=0.1, min_samples=100, num_cv_folds=3):
         self.cv_folds = num_cv_folds * 2
 
     def validate(self, X, y):
-        """Checks if any target labels are imbalanced beyond a threshold for binary and multiclass problems
-        Ignores NaN values in target labels if they appear.
+        """Checks if any target labels are imbalanced beyond a threshold for binary and multiclass problems. Ignores NaN values in target labels if they appear.
 
         Arguments:
             X (pd.DataFrame, np.ndarray): Features. Ignored.
diff --git a/evalml/data_checks/data_check.py b/evalml/data_checks/data_check.py
index 615332fbf5..fc26243337 100644
--- a/evalml/data_checks/data_check.py
+++ b/evalml/data_checks/data_check.py
@@ -5,7 +5,11 @@
 
 
 class DataCheck(ABC):
-    """Base class for all data checks. Data checks are a set of heuristics used to determine if there are problems with input data."""
+    """Base class for all data checks.
+
+    Data checks are a set of heuristics used to determine if there are
+    problems with input data.
+    """
 
     @classproperty
     def name(cls):
@@ -14,8 +18,7 @@ def name(cls):
 
     @abstractmethod
     def validate(self, X, y=None):
-        """
-        Inspects and validates the input data, runs any necessary calculations or algorithms, and returns a list of warnings and errors if applicable.
+        """Inspects and validates the input data, runs any necessary calculations or algorithms, and returns a list of warnings and errors if applicable.
 
         Arguments:
             X (pd.DataFrame): The input data of shape [n_samples, n_features]
diff --git a/evalml/data_checks/data_check_action.py b/evalml/data_checks/data_check_action.py
index 2065c4d944..f7fa85024e 100644
--- a/evalml/data_checks/data_check_action.py
+++ b/evalml/data_checks/data_check_action.py
@@ -11,7 +11,11 @@ def __init__(self, action_code, metadata=None):
         self.metadata = metadata or {}
 
     def __eq__(self, other):
-        """Checks for equality. Two DataCheckAction objs are considered equivalent if all of their attributes are equivalent."""
+        """Checks for equality.
+
+        Two DataCheckAction objs are considered equivalent if all of
+        their attributes are equivalent.
+        """
         return self.action_code == other.action_code and self.metadata == other.metadata
 
     def to_dict(self):
diff --git a/evalml/data_checks/data_check_message.py b/evalml/data_checks/data_check_message.py
index 36fb89a2b7..59243c7fb9 100644
--- a/evalml/data_checks/data_check_message.py
+++ b/evalml/data_checks/data_check_message.py
@@ -24,7 +24,11 @@ def __str__(self):
         return self.message
 
     def __eq__(self, other):
-        """Checks for equality. Two DataCheckMessage objs are considered equivalent if all of their attributes are equivalent."""
+        """Checks for equality.
+
+        Two DataCheckMessage objs are considered equivalent if all of
+        their attributes are equivalent.
+        """
         return (
             self.message_type == other.message_type
             and self.message == other.message
diff --git a/evalml/data_checks/data_checks.py b/evalml/data_checks/data_checks.py
index b0b0b8d9ac..7bb3887b28 100644
--- a/evalml/data_checks/data_checks.py
+++ b/evalml/data_checks/data_checks.py
@@ -73,8 +73,7 @@ def _init_data_checks(data_check_classes, params):
         return data_check_instances
 
     def __init__(self, data_checks=None, data_check_params=None):
-        """
-        A collection of data checks.
+        """A collection of data checks.
 
         Arguments:
             data_checks (list (DataCheck)): List of DataCheck objects
@@ -86,8 +85,7 @@ def __init__(self, data_checks=None, data_check_params=None):
         self.data_checks = data_check_instances
 
     def validate(self, X, y=None):
-        """
-        Inspects and validates the input data against data checks and returns a list of warnings and errors if applicable.
+        """Inspects and validates the input data against data checks and returns a list of warnings and errors if applicable.
 
         Arguments:
             X (pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features]
@@ -95,7 +93,6 @@ def validate(self, X, y=None):
 
         Returns:
             dict: Dictionary containing DataCheckMessage objects
-
         """
         messages = {"warnings": [], "errors": [], "actions": []}
         X = infer_feature_types(X)
diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py
index 177737512e..b8aa0e9883 100644
--- a/evalml/data_checks/datetime_format_data_check.py
+++ b/evalml/data_checks/datetime_format_data_check.py
@@ -5,12 +5,10 @@
 
 
 class DateTimeFormatDataCheck(DataCheck):
-    """Checks if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order
-    to be supported by time series estimators.
+    """Checks if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators.
 
     Arguments:
         datetime_column (str, int): The name of the datetime column. If the datetime values are in the index, then pass "index".
-
     """
 
     def __init__(self, datetime_column="index"):
diff --git a/evalml/data_checks/highly_null_data_check.py b/evalml/data_checks/highly_null_data_check.py
index 66b1692185..5f247edd8a 100644
--- a/evalml/data_checks/highly_null_data_check.py
+++ b/evalml/data_checks/highly_null_data_check.py
@@ -16,7 +16,6 @@ class HighlyNullDataCheck(DataCheck):
             that column will be considered highly-null. Defaults to 0.95.
         pct_null_row_threshold(float): If the percentage of NaN values in an input row exceeds this amount,
             that row will be considered highly-null. Defaults to 0.95.
-
     """
 
     def __init__(self, pct_null_col_threshold=0.95, pct_null_row_threshold=0.95):
@@ -73,7 +72,6 @@ def validate(self, X, y=None):
                      "actions": [{"code": "DROP_ROWS", "metadata": {"rows": [0, 1, 2, 3]}},\
                                  {"code": "DROP_COL",\
                                   "metadata": {"column": "lots_of_null"}}]}
-
         """
         results = {"warnings": [], "errors": [], "actions": []}
diff --git a/evalml/data_checks/multicollinearity_data_check.py b/evalml/data_checks/multicollinearity_data_check.py
index 32f7356cec..6c668e9a49 100644
--- a/evalml/data_checks/multicollinearity_data_check.py
+++ b/evalml/data_checks/multicollinearity_data_check.py
@@ -26,7 +26,6 @@ def validate(self, X, y=None):
 
         Returns:
             dict: dict with a DataCheckWarning if there are any potentially multicollinear columns.
-
         """
         results = {"warnings": [], "errors": [], "actions": []}
diff --git a/evalml/data_checks/outliers_data_check.py b/evalml/data_checks/outliers_data_check.py
index 70fa2e2edc..cde4c34075 100644
--- a/evalml/data_checks/outliers_data_check.py
+++ b/evalml/data_checks/outliers_data_check.py
@@ -10,7 +10,10 @@
 
 
 class OutliersDataCheck(DataCheck):
-    """Checks if there are any outliers in input data by using IQR to determine score anomalies. Columns with score anomalies are considered to contain outliers."""
+    """Checks if there are any outliers in input data by using IQR to determine score anomalies.
+
+    Columns with score anomalies are considered to contain outliers.
+    """
 
     def validate(self, X, y=None):
         """Checks if there are any outliers in a dataframe by using IQR to determine column anomalies. Column with anomalies are considered to contain outliers.
@@ -77,24 +80,7 @@ def validate(self, X, y=None):
 
     @staticmethod
     def _no_outlier_prob(num_records: int, pct_outliers: float) -> float:
-        """
-        This functions calculates the probability that there are no true
-        outliers in a numeric (integer or float) column. It is based on creating
-        100,000 samples consisting of a given number of records, and
-        then repeating this over a grid of sample sizes. Each value in a sample
-        is drawn from a log normal distribution, and then the number of
-        potential outliers in the data is determined using the skew adjusted box
-        plot approach based on the medcouple statistic. It was observed that the
-        distribution of the percentage of outliers could be described by a gamma
-        distribution, with the shape and scale parameters changing with the
-        sample size. For each sample size, the shape and scale parameters of the
-        gamma distriubtion were estimated using maximum likelihood methods. The
-        set of estimate shape and scale parameters for different sample size were
-        then used to fit equations that relate these two parameters to the sample
-        size. These equations use a transendental logrithmic functional form that
-        provides a seventh order Taylor series approximation to the two true
-        functional relationships, and was estimated using least squares
-        regression.
+        """This function calculates the probability that there are no true outliers in a numeric (integer or float) column. It is based on creating 100,000 samples consisting of a given number of records, and then repeating this over a grid of sample sizes. Each value in a sample is drawn from a log normal distribution, and then the number of potential outliers in the data is determined using the skew adjusted box plot approach based on the medcouple statistic. It was observed that the distribution of the percentage of outliers could be described by a gamma distribution, with the shape and scale parameters changing with the sample size. For each sample size, the shape and scale parameters of the gamma distribution were estimated using maximum likelihood methods. The set of estimated shape and scale parameters for different sample sizes were then used to fit equations that relate these two parameters to the sample size. These equations use a transcendental logarithmic functional form that provides a seventh order Taylor series approximation to the two true functional relationships, and was estimated using least squares regression.
 
         Original credit goes to Jad Raad and Dan Putler of Alteryx.
diff --git a/evalml/data_checks/sparsity_data_check.py b/evalml/data_checks/sparsity_data_check.py
index 64dbe68fdf..636cab24e9 100644
--- a/evalml/data_checks/sparsity_data_check.py
+++ b/evalml/data_checks/sparsity_data_check.py
@@ -36,8 +36,7 @@ def __init__(self, problem_type, threshold, unique_count_threshold=10):
         raise ValueError("Unique count threshold must be positive integer.")
 
     def validate(self, X, y=None):
-        """Calculates what percentage of each column's unique values exceed the count threshold and compare
-        that percentage to the sparsity threshold stored in the class instance.
+        """Calculates what percentage of each column's unique values exceed the count threshold and compares that percentage to the sparsity threshold stored in the class instance.
 
         Arguments:
             X (pd.DataFrame, np.ndarray): Features.
@@ -95,8 +94,7 @@ def validate(self, X, y=None):
 
     @staticmethod
     def sparsity_score(col, count_threshold=10):
-        """This function calculates a sparsity score for the given value counts by calculating the percentage of
-        unique values that exceed the count_threshold.
+        """This function calculates a sparsity score for the given value counts by calculating the percentage of unique values that exceed the count_threshold.
 
         Arguments:
             col (pd.Series): Feature values.
diff --git a/evalml/data_checks/target_distribution_data_check.py b/evalml/data_checks/target_distribution_data_check.py
index 23dd71e5b5..54137945d7 100644
--- a/evalml/data_checks/target_distribution_data_check.py
+++ b/evalml/data_checks/target_distribution_data_check.py
@@ -14,8 +14,7 @@
 
 
 class TargetDistributionDataCheck(DataCheck):
-    """Checks if the target data contains certain distributions that may need to be transformed prior training to
-    improve model performance."""
+    """Checks if the target data contains certain distributions that may need to be transformed prior to training to improve model performance."""
 
     def validate(self, X, y):
         """Checks if the target data has a certain distribution.
diff --git a/evalml/data_checks/target_leakage_data_check.py b/evalml/data_checks/target_leakage_data_check.py
index 18c37b033a..479bc91b3c 100644
--- a/evalml/data_checks/target_leakage_data_check.py
+++ b/evalml/data_checks/target_leakage_data_check.py
@@ -14,8 +14,7 @@
 
 
 class TargetLeakageDataCheck(DataCheck):
-    """
-    Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation.
+    """Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation.
 
     If `method='mutual'`, this data check uses mutual information and supports all target and feature types.
     Otherwise, if `method='pearson'`, it uses Pearson correlation and only supports binary with numeric and boolean dtypes.
diff --git a/evalml/data_checks/uniqueness_data_check.py b/evalml/data_checks/uniqueness_data_check.py
index 69dc3e132d..403b20d733 100644
--- a/evalml/data_checks/uniqueness_data_check.py
+++ b/evalml/data_checks/uniqueness_data_check.py
@@ -19,15 +19,13 @@
 
 
 class UniquenessDataCheck(DataCheck):
-    """Checks if there are any columns in the input that are either too unique for classification problems
-    or not unique enough for regression problems.
+    """Checks if there are any columns in the input that are either too unique for classification problems or not unique enough for regression problems.
 
     Arguments:
         problem_type (str or ProblemTypes): The specific problem type to data check for.
             e.g. 'binary', 'multiclass', 'regression, 'time series regression'
         threshold(float): The threshold to set as an upper bound on uniqueness for classification type problems
             or lower bound on for regression type problems. Defaults to 0.50.
-
     """
 
     def __init__(self, problem_type, threshold=0.50):
@@ -37,8 +35,7 @@ def __init__(self, problem_type, threshold=0.50):
         self.threshold = threshold
 
     def validate(self, X, y=None):
-        """Checks if there are any columns in the input that are too unique in the case of classification
-        problems or not unique enough in the case of regression problems.
+        """Checks if there are any columns in the input that are too unique in the case of classification problems or not unique enough in the case of regression problems.
 
         Arguments:
             X (pd.DataFrame, np.ndarray): Features.
@@ -126,8 +123,7 @@ def validate(self, X, y=None):
 
     @staticmethod
     def uniqueness_score(col):
-        """This function calculates a uniqueness score for the provided field. NaN values are
-        not considered as unique values in the calculation.
+        """This function calculates a uniqueness score for the provided field. NaN values are not considered as unique values in the calculation.
 
         Based on the Herfindahl–Hirschman Index.
diff --git a/evalml/data_checks/utils.py b/evalml/data_checks/utils.py
index 259531e0e0..fb2f4182ae 100644
--- a/evalml/data_checks/utils.py
+++ b/evalml/data_checks/utils.py
@@ -3,8 +3,7 @@
 
 class EmptyDataChecks(DataChecks):
     def __init__(self, data_checks=None):
-        """
-        An empty collection of data checks.
+        """An empty collection of data checks.
 
         Arguments:
             data_checks (list (DataCheck)): Ignored.
diff --git a/evalml/exceptions/exceptions.py b/evalml/exceptions/exceptions.py index 3cd06a81f8..428cfa3f66 100644 --- a/evalml/exceptions/exceptions.py +++ b/evalml/exceptions/exceptions.py @@ -8,7 +8,7 @@ class MethodPropertyNotFoundError(Exception): class PipelineNotFoundError(Exception): - """An exception raised when a particular pipeline is not found in automl search results""" + """An exception raised when a particular pipeline is not found in automl search results.""" pass @@ -81,7 +81,7 @@ class DataCheckInitError(Exception): class NullsInColumnWarning(UserWarning): - """Warning thrown when there are null values in the column of interest""" + """Warning thrown when there are null values in the column of interest.""" class ObjectiveCreationError(Exception): @@ -89,7 +89,7 @@ class ObjectiveCreationError(Exception): class NoPositiveLabelException(Exception): - """Exception when a particular classification label for the 'positive' class cannot be found in the column index or unique values""" + """Exception when a particular classification label for the 'positive' class cannot be found in the column index or unique values.""" class ParameterNotUsedWarning(UserWarning): diff --git a/evalml/model_family/utils.py b/evalml/model_family/utils.py index 8e9a39813b..c3cfe64f31 100644 --- a/evalml/model_family/utils.py +++ b/evalml/model_family/utils.py @@ -2,7 +2,7 @@ def handle_model_family(model_family): - """Handles model_family by either returning the ModelFamily or converting from a string + """Handles model_family by either returning the ModelFamily or converting from a string. Arguments: model_family (str or ModelFamily): Model type that needs to be handled diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index d3557c012f..a2dad56481 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -182,8 +182,7 @@ def graph_confusion_matrix( def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): - """ - Given labels and binary classifier predicted probabilities, compute and return the data representing a precision-recall curve. + """Given labels and binary classifier predicted probabilities, compute and return the data representing a precision-recall curve. Arguments: y_true (pd.Series or np.ndarray): True binary labels. @@ -260,8 +259,7 @@ def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): def roc_curve(y_true, y_pred_proba): - """ - Given labels and classifier predicted probabilities, compute and return the data representing a Receiver Operating Characteristic (ROC) curve. Works with binary or multiclass problems. + """Given labels and classifier predicted probabilities, compute and return the data representing a Receiver Operating Characteristic (ROC) curve. Works with binary or multiclass problems. Arguments: y_true (pd.Series or np.ndarray): True labels. @@ -471,7 +469,6 @@ def graph_binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): Returns: plotly.Figure representing the objective score vs. 
threshold graph generated - """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -505,13 +502,14 @@ def _is_feature_of_type(feature, X, ltype): def _put_categorical_feature_first(features, first_feature_categorical): - """If the user is doing a two-way partial dependence plot and one of the features is categorical, - we need to make sure the categorical feature is the first element in the tuple that's passed to sklearn. - - This is because in the two-way grid calculation, sklearn will try to coerce every element of the grid to the - type of the first feature in the tuple. If we put the categorical feature first, the grid will be of type 'object' - which can accommodate both categorical and numeric data. If we put the numeric feature first, the grid will be of - type float64 and we can't coerce categoricals to float64 dtype. + """If the user is doing a two-way partial dependence plot and one of the features is categorical, we need to make sure the categorical feature is the first element in the tuple that's passed to sklearn. + + This is because in the two-way grid calculation, sklearn will try to + coerce every element of the grid to the type of the first feature in + the tuple. If we put the categorical feature first, the grid will be + of type 'object' which can accommodate both categorical and numeric + data. If we put the numeric feature first, the grid will be of type + float64 and we can't coerce categoricals to float64 dtype. """ new_features = features if first_feature_categorical else (features[1], features[0]) return new_features @@ -933,11 +931,7 @@ def _update_fig_with_two_way_partial_dependence( def graph_partial_dependence( pipeline, X, features, class_label=None, grid_resolution=100, kind="average" ): - """Create an one-way or two-way partial dependence plot. Passing a single integer or - string as features will create a one-way partial dependence plot with the feature values - plotted against the partial dependence. Passing features a tuple of int/strings will create - a two-way partial dependence plot with a contour of feature[0] in the y-axis, feature[1] - in the x-axis and the partial dependence in the z-axis. + """Create an one-way or two-way partial dependence plot. Passing a single integer or string as features will create a one-way partial dependence plot with the feature values plotted against the partial dependence. Passing features a tuple of int/strings will create a two-way partial dependence plot with a contour of feature[0] in the y-axis, feature[1] in the x-axis and the partial dependence in the z-axis. Arguments: pipeline (PipelineBase or subclass): Fitted pipeline. @@ -1194,7 +1188,6 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): * `prediction`: Predicted values from regression model. * `actual`: Real target values. * `outlier`: Colors indicating which values are in the threshold for what is considered an outlier value. - """ if outlier_threshold and outlier_threshold <= 0: raise ValueError( @@ -1220,7 +1213,7 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): - """Generate a scatter plot comparing the true and predicted values. Used for regression plotting + """Generate a scatter plot comparing the true and predicted values. Used for regression plotting. 
Arguments: y_true (pd.Series): The real target values of the data @@ -1231,7 +1224,6 @@ def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): Returns: plotly.Figure representing the predicted vs. actual values graph - """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -1304,7 +1296,7 @@ def recurse(i): def decision_tree_data_from_estimator(estimator): - """Return data for a fitted tree in a restructured format + """Return data for a fitted tree in a restructured format. Arguments: estimator (ComponentBase): A fitted DecisionTree-based estimator. @@ -1599,7 +1591,6 @@ def graph_t_sne( Returns: plotly.Figure representing the transformed data - """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py index 5f2c50dc77..6f2f18fd1e 100644 --- a/evalml/model_understanding/permutation_importance.py +++ b/evalml/model_understanding/permutation_importance.py @@ -228,9 +228,10 @@ def _calculate_permutation_scores_fast( def _slow_permutation_importance( pipeline, X, y, objective, col_name=None, n_repeats=5, n_jobs=None, random_seed=None ): - """ - If `col_name` is not None, calculates permutation importance for only the column with that name. Otherwise, calculates the - permutation importance for all columns in the input dataframe. + """If `col_name` is not None, calculates permutation importance for only the column with that name. + + Otherwise, calculates the permutation importance for all columns in + the input dataframe. """ baseline_score = _slow_scorer(pipeline, X, y, objective) if col_name is None: diff --git a/evalml/model_understanding/prediction_explanations/_user_interface.py b/evalml/model_understanding/prediction_explanations/_user_interface.py index 4ae98a08be..cdf4dc6f61 100644 --- a/evalml/model_understanding/prediction_explanations/_user_interface.py +++ b/evalml/model_understanding/prediction_explanations/_user_interface.py @@ -182,11 +182,12 @@ def make_drill_down_dict( original_features, include_shap_values, ): - """Format the 'drill_down' section of the explanation report when output_format="dict" + """Format the 'drill_down' section of the explanation report when output_format="dict". - This section will include the feature values, feature names, qualitative explanation - and shap values (if include_shap_values=True) for the features created from one of the - original features in the data. + This section will include the feature values, feature names, + qualitative explanation and shap values (if + include_shap_values=True) for the features created from one of + the original features in the data. """ drill_down = {} for parent_feature, children_features in provenance.items(): diff --git a/evalml/model_understanding/prediction_explanations/explainers.py b/evalml/model_understanding/prediction_explanations/explainers.py index cb27003aa3..64a2347359 100644 --- a/evalml/model_understanding/prediction_explanations/explainers.py +++ b/evalml/model_understanding/prediction_explanations/explainers.py @@ -108,11 +108,10 @@ def explain_predictions( def _update_progress(start_time, current_time, progress_stage, callback_function): - """ - Helper function for updating progress of a function and making a call to the user-provided callback - function, if provided. 
The callback function should accept the following parameters:
-    - progress_stage: stage of computation
-    - time_elapsed: total time in seconds that has elapsed since start of call
+    """Helper function for updating progress of a function and making a call to the user-provided callback function, if provided. The callback function should accept the following parameters:
+
+    - progress_stage: stage of computation
+    - time_elapsed: total time in seconds that has elapsed since start of call
    """
    if callback_function is not None:
        elapsed_time = current_time - start_time
diff --git a/evalml/objectives/binary_classification_objective.py b/evalml/objectives/binary_classification_objective.py
index ed190e33c0..d067ef496d 100644
--- a/evalml/objectives/binary_classification_objective.py
+++ b/evalml/objectives/binary_classification_objective.py
@@ -15,7 +15,11 @@ class BinaryClassificationObjective(ObjectiveBase):
    @property
    def can_optimize_threshold(cls):
        """Returns a boolean determining if we can optimize the binary classification objective threshold.
-        This will be false for any objective that works directly with predicted probabilities, like log loss and AUC. Otherwise, it will be true."""
+
+        This will be false for any objective that works directly with
+        predicted probabilities, like log loss and AUC. Otherwise, it
+        will be true.
+        """
        return not cls.score_needs_proba
 
    def optimize_threshold(self, ypred_proba, y_true, X=None):
diff --git a/evalml/objectives/cost_benefit_matrix.py b/evalml/objectives/cost_benefit_matrix.py
index 15cc19fa5b..ff45ff6a5a 100644
--- a/evalml/objectives/cost_benefit_matrix.py
+++ b/evalml/objectives/cost_benefit_matrix.py
@@ -6,10 +6,7 @@
 
 
 class CostBenefitMatrix(BinaryClassificationObjective):
-    """
-    Score using a cost-benefit matrix. Scores quantify the benefits of a given value, so greater numeric
-    scores represents a better score. Costs and scores can be negative, indicating that a value is not beneficial.
-    For example, in the case of monetary profit, a negative cost and/or score represents loss of cash flow.
+    """Score using a cost-benefit matrix. Scores quantify the benefits of a given value, so greater numeric scores represent a better score. Costs and scores can be negative, indicating that a value is not beneficial. For example, in the case of monetary profit, a negative cost and/or score represents loss of cash flow.
 
     Arguments:
        true_positive (float): Cost associated with true positive predictions
diff --git a/evalml/objectives/objective_base.py b/evalml/objectives/objective_base.py
index 42b0b3837d..6aca76d283 100644
--- a/evalml/objectives/objective_base.py
+++ b/evalml/objectives/objective_base.py
@@ -28,7 +28,13 @@ def greater_is_better(cls):
    @classmethod
    @abstractmethod
    def score_needs_proba(cls):
-        """Returns a boolean determining if the score() method needs probability estimates. This should be true for objectives which work with predicted probabilities, like log loss or AUC, and false for objectives which compare predicted class labels to the actual labels, like F1 or correlation."""
+        """Returns a boolean determining if the score() method needs probability estimates.
+
+        This should be true for objectives which work with predicted
+        probabilities, like log loss or AUC, and false for objectives
+        which compare predicted class labels to the actual labels, like
+        F1 or correlation.
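# Editor's aside: a small standalone sketch of the scoring idea behind
# CostBenefitMatrix (above): weight each confusion-matrix cell and sum. The
# weights and per-sample normalization here are illustrative assumptions, not
# evalml defaults.
import numpy as np

def cost_benefit_score(y_true, y_pred, true_positive=10.0, true_negative=0.0,
                       false_positive=-1.0, false_negative=-5.0):
    y_true = np.asarray(y_true).astype(bool)
    y_pred = np.asarray(y_pred).astype(bool)
    # Negative weights model costs, so the total can be negative overall.
    total = (
        np.sum(y_true & y_pred) * true_positive
        + np.sum(~y_true & ~y_pred) * true_negative
        + np.sum(~y_true & y_pred) * false_positive
        + np.sum(y_true & ~y_pred) * false_negative
    )
    return total / len(y_true)

print(cost_benefit_score([1, 0, 1, 0], [1, 1, 1, 0]))  # (2*10 + 0 - 1) / 4 = 4.75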
+        """
 
    @property
    @classmethod
@@ -46,12 +52,16 @@ def is_bounded_like_percentage(cls):
    @classmethod
    @abstractmethod
    def expected_range(cls):
-        """Returns the expected range of the objective, which is not necessarily the possible ranges. For example, our expected R2 range is from [-1, 1], although the actual range is (-inf, 1]."""
+        """Returns the expected range of the objective, which is not necessarily the possible ranges.
+
+        For example, our expected R2 range is from [-1, 1], although the
+        actual range is (-inf, 1].
+        """
 
    @classmethod
    @abstractmethod
    def objective_function(cls, y_true, y_predicted, X=None, sample_weight=None):
-        """Computes the relative value of the provided predictions compared to the actual labels, according a specified metric
+        """Computes the relative value of the provided predictions compared to the actual labels, according to a specified metric.
 
        Arguments:
            y_predicted (pd.Series): Predicted values of length [n_samples]
@@ -65,7 +75,10 @@ def objective_function(cls, y_true, y_predicted, X=None, sample_weight=None):
 
    @classproperty
    def positive_only(cls):
-        """If True, this objective is only valid for positive data. Default False."""
+        """If True, this objective is only valid for positive data.
+
+        Default False.
+        """
        return False
 
    def score(self, y_true, y_predicted, X=None, sample_weight=None):
diff --git a/evalml/objectives/sensitivity_low_alert.py b/evalml/objectives/sensitivity_low_alert.py
index 0c5753ce1e..b75d0c61d5 100644
--- a/evalml/objectives/sensitivity_low_alert.py
+++ b/evalml/objectives/sensitivity_low_alert.py
@@ -16,11 +16,10 @@ class SensitivityLowAlert(BinaryClassificationObjective):
    expected_range = [0, 1]
 
    def __init__(self, alert_rate=0.01):
-        """Create instance of SensitivityLowAlert
+        """Create instance of SensitivityLowAlert.
 
        Arguments:
            alert_rate (float): percentage of top scores to classify as high risk
-
        """
        if (alert_rate > 1) or (alert_rate < 0):
            raise ValueError("Alert rate is outside of valid range [0,1]")
@@ -28,7 +27,7 @@ def __init__(self, alert_rate=0.01):
        self.alert_rate = alert_rate
 
    def decision_function(self, ypred_proba, **kwargs):
-        """Determine if an observation is high risk given an alert rate
+        """Determine if an observation is high risk given an alert rate.
 
        Arguments:
            ypred_proba (pd.Series): Predicted probabilities
@@ -47,7 +46,7 @@ def decision_function(self, ypred_proba, **kwargs):
        return ypred_proba.astype(float) >= prob_thresh
 
    def objective_function(self, y_true, y_predicted, **kwargs):
-        """Calculate sensitivity across all predictions, using the top alert_rate percent of observations as the predicted positive class
+        """Calculate sensitivity across all predictions, using the top alert_rate percent of observations as the predicted positive class.
 
        Arguments:
            y_true (pd.Series): True labels
diff --git a/evalml/objectives/standard_metrics.py b/evalml/objectives/standard_metrics.py
index 616c02e873..095b68d494 100644
--- a/evalml/objectives/standard_metrics.py
+++ b/evalml/objectives/standard_metrics.py
@@ -467,7 +467,8 @@ def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
 
 class RootMeanSquaredLogError(RegressionObjective):
    """Root mean squared log error for regression.
 
-    Only valid for nonnegative inputs.Otherwise, will throw a ValueError.
+    Only valid for nonnegative inputs. Otherwise, will throw a
+    ValueError.
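# Editor's aside: a standalone sketch of the metric the class above describes.
# Using log1p mirrors the usual definition of RMSLE and shows why negative
# inputs are rejected; this is an illustration, not the evalml implementation.
import numpy as np

def rmsle(y_true, y_predicted):
    y_true = np.asarray(y_true, dtype=float)
    y_predicted = np.asarray(y_predicted, dtype=float)
    if (y_true < 0).any() or (y_predicted < 0).any():
        # Matches the docstring: only valid for nonnegative inputs.
        raise ValueError("Root Mean Squared Log Error is only valid for nonnegative inputs.")
    return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_predicted)) ** 2))

print(rmsle([1, 2, 3], [1, 2, 4]))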
""" name = "Root Mean Squared Log Error" @@ -486,14 +487,18 @@ def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): @classproperty def positive_only(self): - """If True, this objective is only valid for positive data. Default False.""" + """If True, this objective is only valid for positive data. + + Default False. + """ return True class MeanSquaredLogError(RegressionObjective): """Mean squared log error for regression. - Only valid for nonnegative inputs. Otherwise, will throw a ValueError + Only valid for nonnegative inputs. Otherwise, will throw a + ValueError """ name = "Mean Squared Log Error" @@ -510,7 +515,10 @@ def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): @classproperty def positive_only(self): - """If True, this objective is only valid for positive data. Default False.""" + """If True, this objective is only valid for positive data. + + Default False. + """ return True @@ -572,7 +580,10 @@ def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): @classproperty def positive_only(self): - """If True, this objective is only valid for positive data. Default False.""" + """If True, this objective is only valid for positive data. + + Default False. + """ return True diff --git a/evalml/pipelines/binary_classification_pipeline_mixin.py b/evalml/pipelines/binary_classification_pipeline_mixin.py index 53008ea1a3..0438ac425d 100644 --- a/evalml/pipelines/binary_classification_pipeline_mixin.py +++ b/evalml/pipelines/binary_classification_pipeline_mixin.py @@ -3,7 +3,10 @@ class BinaryClassificationPipelineMixin: @property def threshold(self): - """Threshold used to make a prediction. Defaults to None.""" + """Threshold used to make a prediction. + + Defaults to None. + """ return self._threshold @threshold.setter diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py index dbe3f2994b..1988169589 100644 --- a/evalml/pipelines/classification_pipeline.py +++ b/evalml/pipelines/classification_pipeline.py @@ -36,8 +36,7 @@ def __init__( ) def fit(self, X, y): - """Build a classification model. For string and categorical targets, classes are sorted - by sorted(set(y)) and then are mapped to values between 0 and n_classes-1. + """Build a classification model. For string and categorical targets, classes are sorted by sorted(set(y)) and then are mapped to values between 0 and n_classes-1. Arguments: X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] @@ -45,7 +44,6 @@ def fit(self, X, y): Returns: self - """ X = infer_feature_types(X) y = infer_feature_types(y) @@ -63,9 +61,11 @@ def _encode_targets(self, y): def _decode_targets(self, y): """Converts encoded numerical values to their original target values. + Note: we cast y as ints first to address boolean values that may be returned from calculating predictions which we would not be able to otherwise transform if we - originally had integer targets.""" + originally had integer targets. + """ return self._encoder.inverse_transform(y.astype(int)) @property @@ -129,7 +129,7 @@ def predict_proba(self, X): return infer_feature_types(proba) def score(self, X, y, objectives): - """Evaluate model performance on objectives + """Evaluate model performance on objectives. 
Arguments:
        X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py
index 5f634a1f78..44a0ed8245 100644
--- a/evalml/pipelines/component_graph.py
+++ b/evalml/pipelines/component_graph.py
@@ -116,8 +116,7 @@ def default_parameters(self):
        return defaults
 
    def instantiate(self, parameters):
-        """Instantiates all uninstantiated components within the graph using the given parameters. An error will be
-        raised if a component is already instantiated but the parameters dict contains arguments for that component.
+        """Instantiates all uninstantiated components within the graph using the given parameters. An error will be raised if a component is already instantiated but the parameters dict contains arguments for that component.
 
        Arguments:
            parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
@@ -180,8 +179,7 @@ def fit_features(self, X, y):
        return self._fit_transform_features_helper(True, X, y)
 
    def compute_final_component_features(self, X, y=None):
-        """Transform all components save the final one, and gathers the data from any number of parents
-        to get all the information that should be fed to the final component.
+        """Transforms all components save the final one, and gathers the data from any number of parents to get all the information that should be fed to the final component.
 
        Arguments:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
@@ -481,7 +479,7 @@ def get_inputs(self, component_name):
        return []
 
    def describe(self, return_dict=False):
-        """Outputs component graph details including component parameters
+        """Outputs component graph details including component parameters.
 
        Arguments:
            return_dict (bool): If True, return dictionary of information about component graph. Defaults to False.
@@ -504,7 +502,7 @@ def describe(self, return_dict=False):
        return components
 
    def graph(self, name=None, graph_format=None):
-        """Generate an image representing the component graph
+        """Generate an image representing the component graph.
 
        Arguments:
            name (str): Name of the graph. Defaults to None.
@@ -565,7 +563,7 @@ def _get_edges(component_dict):
 
    @classmethod
    def generate_order(cls, component_dict):
-        """Regenerated the topologically sorted order of the graph"""
+        """Regenerates the topologically sorted order of the graph."""
        edges = cls._get_edges(component_dict)
        if len(component_dict) == 1:
            return list(component_dict.keys())
@@ -602,7 +600,7 @@ def __iter__(self):
        return self
 
    def __next__(self):
-        """Iterator for graphs, retrieves the components in the graph in order
+        """Iterator for graphs, retrieves the components in the graph in order.
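# Editor's aside: generate_order (above) needs a topological ordering of the
# component graph. A generic Kahn's-algorithm sketch of that idea; the edge
# list format is simplified relative to evalml's component dictionary.
from collections import deque

def topological_order(nodes, edges):
    indegree = {node: 0 for node in nodes}
    children = {node: [] for node in nodes}
    for parent, child in edges:
        children[parent].append(child)
        indegree[child] += 1
    ready = deque(node for node in nodes if indegree[node] == 0)
    order = []
    while ready:
        node = ready.popleft()
        order.append(node)
        for child in children[node]:
            indegree[child] -= 1
            if indegree[child] == 0:
                ready.append(child)
    if len(order) != len(nodes):
        raise ValueError("The component graph contains a cycle.")
    return order

print(topological_order(
    ["Imputer", "One Hot Encoder", "Random Forest Classifier"],
    [("Imputer", "One Hot Encoder"), ("One Hot Encoder", "Random Forest Classifier")],
))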
Returns: ComponentBase obj: The next component class or instance in the graph diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index ddcbb4b2fb..543d7299fb 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -44,39 +44,48 @@ def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs) @classmethod @abstractmethod def name(cls): - """Returns string name of this component""" + """Returns string name of this component.""" @property @classmethod @abstractmethod def model_family(cls): - """Returns ModelFamily of this component""" + """Returns ModelFamily of this component.""" @property @classmethod @abstractmethod def modifies_features(cls): """Returns whether this component modifies (subsets or transforms) the features variable during transform. - For Estimator objects, this attribute determines if the return value from `predict` or `predict_proba` should be used as features or targets.""" + + For Estimator objects, this attribute determines if the return + value from `predict` or `predict_proba` should be used as + features or targets. + """ @property @classmethod @abstractmethod def modifies_target(cls): """Returns whether this component modifies (subsets or transforms) the target variable during transform. - For Estimator objects, this attribute determines if the return value from `predict` or `predict_proba` should be used as features or targets.""" + + For Estimator objects, this attribute determines if the return + value from `predict` or `predict_proba` should be used as + features or targets. + """ @classproperty def needs_fitting(self): - """Returns boolean determining if component needs fitting before - calling predict, predict_proba, transform, or feature_importances. - This can be overridden to False for components that do not need to be fit - or whose fit methods do nothing.""" + """Returns boolean determining if component needs fitting before calling predict, predict_proba, transform, or feature_importances. + + This can be overridden to False for components that do not need + to be fit or whose fit methods do nothing. + """ return True @property def parameters(self): - """Returns the parameters which were used to initialize the component""" + """Returns the parameters which were used to initialize the component.""" return copy.copy(self._parameters) @classproperty @@ -107,7 +116,7 @@ def clone(self): return self.__class__(**self.parameters, random_seed=self.random_seed) def fit(self, X, y=None): - """Fits component to data + """Fits component to data. Arguments: X (list, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] @@ -128,7 +137,7 @@ def fit(self, X, y=None): ) def describe(self, print_name=False, return_dict=False): - """Describe a component and its parameters + """Describe a component and its parameters. Arguments: print_name(bool, optional): whether to print name of component @@ -151,7 +160,7 @@ def describe(self, print_name=False, return_dict=False): return component_dict def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): - """Saves component at file path + """Saves component at file path. Arguments: file_path (str): Location to save file @@ -165,7 +174,7 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): @staticmethod def load(file_path): - """Loads component at file path + """Loads component at file path. 
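# Editor's aside: save/load (above) round-trip a component through cloudpickle.
# A standalone sketch of that pattern; the helper names here are hypothetical.
import cloudpickle

def save(component, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL):
    with open(file_path, "wb") as f:
        cloudpickle.dump(component, f, protocol=pickle_protocol)

def load(file_path):
    with open(file_path, "rb") as f:
        return cloudpickle.load(f)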
Arguments: file_path (str): Location to load file diff --git a/evalml/pipelines/components/component_base_meta.py b/evalml/pipelines/components/component_base_meta.py index 8520382c9c..f9ea2551fc 100644 --- a/evalml/pipelines/components/component_base_meta.py +++ b/evalml/pipelines/components/component_base_meta.py @@ -5,12 +5,14 @@ class ComponentBaseMeta(BaseMeta): - """Metaclass that overrides creating a new component by wrapping methods with validators and setters""" + """Metaclass that overrides creating a new component by wrapping methods with validators and setters.""" @classmethod def check_for_fit(cls, method): """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. - It raises an exception if `False` and calls and returns the wrapped method if `True`. + + It raises an exception if `False` and calls and returns the + wrapped method if `True`. """ @wraps(method) diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py index 363623c199..3ded6e7890 100644 --- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py +++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py @@ -94,7 +94,7 @@ def __init__( @property def feature_importance(self): - """Not implemented for SklearnStackedEnsembleClassifier and SklearnStackedEnsembleRegressor""" + """Not implemented for SklearnStackedEnsembleClassifier and SklearnStackedEnsembleRegressor.""" raise NotImplementedError( "feature_importance is not implemented for SklearnStackedEnsembleClassifier and SklearnStackedEnsembleRegressor" ) diff --git a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py index 412262e06f..99520f618d 100644 --- a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py @@ -13,9 +13,7 @@ class CatBoostClassifier(Estimator): - """ - CatBoost Classifier, a classifier that uses gradient-boosting on decision trees. - CatBoost is an open-source library and natively supports categorical features. + """CatBoost Classifier, a classifier that uses gradient-boosting on decision trees. CatBoost is an open-source library and natively supports categorical features. For more information, check out https://catboost.ai/ diff --git a/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py b/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py index 4e344a8b5d..58354738d5 100644 --- a/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py @@ -10,8 +10,7 @@ class ElasticNetClassifier(Estimator): - """ - Elastic Net Classifier. Uses Logistic Regression with elasticnet penalty as the base estimator. + """Elastic Net Classifier. Uses Logistic Regression with elasticnet penalty as the base estimator. Arguments: penalty ({"l1", "l2", "elasticnet", "none"}): The norm used in penalization. Defaults to "elasticnet". 
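# Editor's aside: ComponentBaseMeta.check_for_fit (above) wraps methods so they
# fail loudly on unfitted components. A standalone sketch of that wrapper
# pattern; the exception type and the _is_fitted attribute are assumptions.
from functools import wraps

def check_for_fit(method):
    @wraps(method)
    def _check_for_fit(self, *args, **kwargs):
        if not getattr(self, "_is_fitted", False):
            raise RuntimeError(
                f"You must fit the component before calling {method.__name__}()."
            )
        return method(self, *args, **kwargs)
    return _check_for_fit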
diff --git a/evalml/pipelines/components/estimators/classifiers/et_classifier.py b/evalml/pipelines/components/estimators/classifiers/et_classifier.py index da43e4237e..4d550332e2 100644 --- a/evalml/pipelines/components/estimators/classifiers/et_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/et_classifier.py @@ -7,8 +7,7 @@ class ExtraTreesClassifier(Estimator): - """ - Extra Trees Classifier. + """Extra Trees Classifier. Arguments: n_estimators (float): The number of trees in the forest. Defaults to 100. diff --git a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py index 7635cd15e9..f74b395aa4 100644 --- a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py @@ -8,8 +8,7 @@ class KNeighborsClassifier(Estimator): - """ - K-Nearest Neighbors Classifier. + """K-Nearest Neighbors Classifier. Arguments: n_neighbors (int): Number of neighbors to use by default. Defaults to 5. @@ -93,9 +92,6 @@ def __init__( @property def feature_importance(self): - """ - Returns array of 0's matching the input number of features as feature_importance is - not defined for KNN classifiers. - """ + """Returns array of 0's matching the input number of features as feature_importance is not defined for KNN classifiers.""" num_features = self._component_obj.n_features_in_ return np.zeros(num_features) diff --git a/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py b/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py index 9954a34806..fc02f27a8d 100644 --- a/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py @@ -8,8 +8,7 @@ class LogisticRegressionClassifier(Estimator): - """ - Logistic Regression Classifier. + """Logistic Regression Classifier. Arguments: penalty ({"l1", "l2", "elasticnet", "none"}): The norm used in penalization. Defaults to "l2". diff --git a/evalml/pipelines/components/estimators/classifiers/rf_classifier.py b/evalml/pipelines/components/estimators/classifiers/rf_classifier.py index 347c7195f2..7f924c9540 100644 --- a/evalml/pipelines/components/estimators/classifiers/rf_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/rf_classifier.py @@ -7,8 +7,7 @@ class RandomForestClassifier(Estimator): - """ - Random Forest Classifier. + """Random Forest Classifier. Arguments: n_estimators (float): The number of trees in the forest. Defaults to 100. diff --git a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py index f9745d6f53..171032a808 100644 --- a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py @@ -8,8 +8,7 @@ class SVMClassifier(Estimator): - """ - Support Vector Machine Classifier. + """Support Vector Machine Classifier. Arguments: C (float): The regularization parameter. The strength of the regularization is inversely proportional to C. @@ -72,6 +71,7 @@ def __init__( @property def feature_importance(self): """Feature importance only works with linear kernels. 
+ If the kernel isn't linear, we return a numpy array of zeros """ if self._parameters["kernel"] != "linear": diff --git a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py index 5e7b2c3dec..f829290d5b 100644 --- a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py @@ -10,8 +10,7 @@ class XGBoostClassifier(Estimator): - """ - XGBoost Classifier. + """XGBoost Classifier. Arguments: eta (float): Boosting learning rate. Defaults to 0.1. diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py index d7919f4008..0c2ac012cc 100644 --- a/evalml/pipelines/components/estimators/estimator.py +++ b/evalml/pipelines/components/estimators/estimator.py @@ -37,7 +37,7 @@ class Estimator(ComponentBase): @classmethod @abstractmethod def supported_problem_types(cls): - """Problem types this estimator supports""" + """Problem types this estimator supports.""" def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): self.input_feature_names = None diff --git a/evalml/pipelines/components/estimators/regressors/arima_regressor.py b/evalml/pipelines/components/estimators/regressors/arima_regressor.py index 0e8b17fa03..6bf2320399 100644 --- a/evalml/pipelines/components/estimators/regressors/arima_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/arima_regressor.py @@ -9,10 +9,7 @@ class ARIMARegressor(Estimator): - """ - Autoregressive Integrated Moving Average Model. - The three parameters (p, d, q) are the AR order, the degree of differencing, and the MA order. - More information here: https://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.html + """Autoregressive Integrated Moving Average Model. The three parameters (p, d, q) are the AR order, the degree of differencing, and the MA order. More information here: https://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.html. Currently ARIMARegressor isn't supported via conda install. It's recommended that it be installed via PyPI. @@ -196,7 +193,5 @@ def predict(self, X, y=None): @property def feature_importance(self): - """ - Returns array of 0's with a length of 1 as feature_importance is not defined for ARIMA regressor. - """ + """Returns array of 0's with a length of 1 as feature_importance is not defined for ARIMA regressor.""" return np.zeros(1) diff --git a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py index 570ecf2784..8b1d8d8ca3 100644 --- a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py @@ -8,9 +8,7 @@ class BaselineRegressor(Estimator): - """ - Baseline regressor that uses a simple strategy to make predictions. - This is useful as a simple baseline regressor to compare with other regressors. + """Baseline regressor that uses a simple strategy to make predictions. This is useful as a simple baseline regressor to compare with other regressors. Arguments: strategy (str): Method used to predict. Valid options are "mean", "median". Defaults to "mean". 
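# Editor's aside: a compact sketch of the baseline strategy described above --
# memorize the training mean or median, then predict that constant. Standalone
# illustration only, not the evalml class.
import numpy as np

class SimpleBaselineRegressor:
    def __init__(self, strategy="mean"):
        if strategy not in ("mean", "median"):
            raise ValueError('strategy must be "mean" or "median"')
        self.strategy = strategy

    def fit(self, X, y):
        y = np.asarray(y, dtype=float)
        self._prediction = np.mean(y) if self.strategy == "mean" else np.median(y)
        return self

    def predict(self, X):
        # Every row receives the same constant prediction.
        return np.full(len(X), self._prediction)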
@@ -69,6 +67,5 @@ def feature_importance(self): Returns: np.ndarray (float): An array of zeroes - """ return np.zeros(self._num_features) diff --git a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py index e79f4d201e..2663efc0e2 100644 --- a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py @@ -10,9 +10,7 @@ class CatBoostRegressor(Estimator): - """ - CatBoost Regressor, a regressor that uses gradient-boosting on decision trees. - CatBoost is an open-source library and natively supports categorical features. + """CatBoost Regressor, a regressor that uses gradient-boosting on decision trees. CatBoost is an open-source library and natively supports categorical features. For more information, check out https://catboost.ai/ diff --git a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py index 0fcf9c7c8b..0eae370573 100644 --- a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py @@ -7,8 +7,7 @@ class DecisionTreeRegressor(Estimator): - """ - Decision Tree Regressor. + """Decision Tree Regressor. Arguments: criterion ({"mse", "friedman_mse", "mae", "poisson"}): The function to measure the quality of a split. diff --git a/evalml/pipelines/components/estimators/regressors/prophet_regressor.py b/evalml/pipelines/components/estimators/regressors/prophet_regressor.py index 3caca48c00..76050e7b47 100644 --- a/evalml/pipelines/components/estimators/regressors/prophet_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/prophet_regressor.py @@ -12,12 +12,9 @@ class ProphetRegressor(Estimator): - """ - Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. - It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well. + """Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well. More information here: https://facebook.github.io/prophet/ - """ name = "Prophet Regressor" @@ -135,9 +132,7 @@ def get_params(self): @property def feature_importance(self): - """ - Returns array of 0's with len(1) as feature_importance is not defined for Prophet regressor. 
-    """
+        """Returns array of 0's with len(1) as feature_importance is not defined for Prophet regressor."""
        return np.zeros(1)
 
    @classproperty
diff --git a/evalml/pipelines/components/estimators/regressors/svm_regressor.py b/evalml/pipelines/components/estimators/regressors/svm_regressor.py
index 6e898dbefc..f7334da8b3 100644
--- a/evalml/pipelines/components/estimators/regressors/svm_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/svm_regressor.py
@@ -55,6 +55,7 @@ def __init__(self, C=1.0, kernel="rbf", gamma="scale", random_seed=0, **kwargs):
    @property
    def feature_importance(self):
        """Feature importance only works with linear kernels.
+
        If the kernel isn't linear, we return a numpy array of zeros
        """
        if self._parameters["kernel"] != "linear":
diff --git a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py
index ad5e791567..a4c0f4fd70 100644
--- a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py
+++ b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py
@@ -89,6 +89,5 @@ def feature_importance(self):
 
        Returns:
            np.ndarray (float): an array of zeroes
-
        """
        return np.zeros(self._num_features)
diff --git a/evalml/pipelines/components/transformers/column_selectors.py b/evalml/pipelines/components/transformers/column_selectors.py
index 67a2bb66f5..d60cfd6b94 100644
--- a/evalml/pipelines/components/transformers/column_selectors.py
+++ b/evalml/pipelines/components/transformers/column_selectors.py
@@ -5,8 +5,7 @@
 
 
 class ColumnSelector(Transformer):
-    """
-    Initalizes an transformer that drops specified columns in input data.
+    """Initializes a transformer that drops specified columns in input data.
 
    Arguments:
        columns (list(string)): List of column names, used to determine which columns to select.
@@ -60,8 +59,7 @@ def transform(self, X, y=None):
 
 
 class DropColumns(ColumnSelector):
-    """
-    Drops specified columns in input data.
+    """Drops specified columns in input data.
 
    Arguments:
        columns (list(string)): List of column names, used to determine which columns to drop.
@@ -90,8 +88,7 @@ def transform(self, X, y=None):
 
 
 class SelectColumns(ColumnSelector):
-    """
-    Selects specified columns in input data.
+    """Selects specified columns in input data.
 
    Arguments:
        columns (list(string)): List of column names, used to determine which columns to select.
@@ -120,8 +117,7 @@ def transform(self, X, y=None):
 
 
 class SelectByType(ColumnSelector):
-    """
-    Selects columns by specified Woodwork logical type or semantic tag in input data.
+    """Selects columns by specified Woodwork logical type or semantic tag in input data.
 
    Arguments:
        column_types (string, ww.LogicalType, list(string), list(ww.LogicalType)): List of Woodwork types or tags, used to determine which columns to select.
diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
index d05836ae82..16fb1047b1 100644
--- a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
+++ b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
@@ -10,8 +10,7 @@
 
 
 class LinearDiscriminantAnalysis(Transformer):
-    """
-    Reduces the number of features by using Linear Discriminant Analysis.
+    """Reduces the number of features by using Linear Discriminant Analysis.
Arguments: n_components (int): The number of features to maintain after computation. Defaults to None. diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py index 9e3a2ae0d1..84c43ce9cf 100644 --- a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py +++ b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py @@ -11,8 +11,7 @@ class PCA(Transformer): - """ - Reduces the number of features by using Principal Component Analysis (PCA). + """Reduces the number of features by using Principal Component Analysis (PCA). Arguments: variance (float): The percentage of the original data variance that should be preserved when reducing the diff --git a/evalml/pipelines/components/transformers/encoders/target_encoder.py b/evalml/pipelines/components/transformers/encoders/target_encoder.py index 1f2525227c..96c7474f08 100644 --- a/evalml/pipelines/components/transformers/encoders/target_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/target_encoder.py @@ -14,8 +14,7 @@ class TargetEncoder(Transformer, metaclass=OneHotEncoderMeta): - """ - A transformer that encodes categorical features into target encodings. + """A transformer that encodes categorical features into target encodings. Arguments: cols (list): Columns to encode. If None, all string columns will be encoded, otherwise only the columns provided will be encoded. diff --git a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py index a77838072b..0c6b8393b4 100644 --- a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py @@ -9,8 +9,7 @@ class FeatureSelector(Transformer): - """ - Selects top features based on importance weights. + """Selects top features based on importance weights. Arguments: parameters (dict): Dictionary of parameters for the component. Defaults to None. diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py index d12dc83a07..f2f7fd9b76 100644 --- a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py @@ -7,8 +7,7 @@ class RFClassifierSelectFromModel(FeatureSelector): - """ - Selects top features based on importance weights using a Random Forest classifier. + """Selects top features based on importance weights using a Random Forest classifier. Arguments: number_features (int): The maximum number of features to select. diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py index 1b9e43a362..ecd163bd62 100644 --- a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py @@ -7,8 +7,7 @@ class RFRegressorSelectFromModel(FeatureSelector): - """ - Selects top features based on importance weights using a Random Forest regressor. 
+ """Selects top features based on importance weights using a Random Forest regressor. Arguments: number_features (int): The maximum number of features to select. diff --git a/evalml/pipelines/components/transformers/imputers/imputer.py b/evalml/pipelines/components/transformers/imputers/imputer.py index 613434c1fb..868b66181c 100644 --- a/evalml/pipelines/components/transformers/imputers/imputer.py +++ b/evalml/pipelines/components/transformers/imputers/imputer.py @@ -76,8 +76,7 @@ def __init__( ) def fit(self, X, y=None): - """Fits imputer to data. 'None' values are converted to np.nan before imputation and are - treated as the same. + """Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same. Arguments: X (pd.DataFrame, np.ndarray): The input training data of shape [n_samples, n_features] @@ -107,8 +106,7 @@ def fit(self, X, y=None): return self def transform(self, X, y=None): - """Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are - treated as the same. + """Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are treated as the same. Arguments: X (pd.DataFrame): Data to transform diff --git a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py index bfdd02ccfe..e78bf6bb46 100644 --- a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py @@ -52,7 +52,7 @@ def __init__( ) def fit(self, X, y=None): - """Fits imputers on input data + """Fits imputers on input data. Arguments: X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit. diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py index 42563b1947..f8694afdd2 100644 --- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py @@ -38,8 +38,7 @@ def __init__( ) def fit(self, X, y=None): - """Fits imputer to data. 'None' values are converted to np.nan before imputation and are - treated as the same. + """Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same. Arguments: X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] @@ -106,7 +105,7 @@ def transform(self, X, y=None): return _retain_custom_types_and_initalize_woodwork(original_logical_types, X) def fit_transform(self, X, y=None): - """Fits on X and transforms X + """Fits on X and transforms X. 
Arguments: X (pd.DataFrame): Data to fit and transform diff --git a/evalml/pipelines/components/transformers/imputers/target_imputer.py b/evalml/pipelines/components/transformers/imputers/target_imputer.py index eab81a03e6..74c4615839 100644 --- a/evalml/pipelines/components/transformers/imputers/target_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/target_imputer.py @@ -13,12 +13,14 @@ class TargetImputerMeta(ComponentBaseMeta): - """A version of the ComponentBaseMeta class which handles when input features is None""" + """A version of the ComponentBaseMeta class which handles when input features is None.""" @classmethod def check_for_fit(cls, method): """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. - It raises an exception if `False` and calls and returns the wrapped method if `True`. + + It raises an exception if `False` and calls and returns the + wrapped method if `True`. """ @wraps(method) @@ -64,8 +66,7 @@ def __init__( ) def fit(self, X, y): - """Fits imputer to target data. 'None' values are converted to np.nan before imputation and are - treated as the same. + """Fits imputer to target data. 'None' values are converted to np.nan before imputation and are treated as the same. Arguments: X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored. diff --git a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py index d817beb018..11159eb0df 100644 --- a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py @@ -56,8 +56,7 @@ def _extract_hour(col, encode_as_categories=False): class DateTimeFeaturizer(Transformer): - """ - Transformer that can automatically extract features from datetime columns. + """Transformer that can automatically extract features from datetime columns. Arguments: features_to_extract (list): List of features to extract. Valid options include "year", "month", "day_of_week", "hour". Defaults to None. @@ -118,7 +117,7 @@ def fit(self, X, y=None): return self def transform(self, X, y=None): - """Transforms data X by creating new features using existing DateTime columns, and then dropping those DateTime columns + """Transforms data X by creating new features using existing DateTime columns, and then dropping those DateTime columns. 
Arguments: X (pd.DataFrame): Data to transform diff --git a/evalml/pipelines/components/transformers/preprocessing/featuretools.py b/evalml/pipelines/components/transformers/preprocessing/featuretools.py index e6e4feb36a..9fd4a1f63c 100644 --- a/evalml/pipelines/components/transformers/preprocessing/featuretools.py +++ b/evalml/pipelines/components/transformers/preprocessing/featuretools.py @@ -31,7 +31,7 @@ def __init__(self, index="index", random_seed=0, **kwargs): super().__init__(parameters=parameters, random_seed=random_seed) def _make_entity_set(self, X): - """Helper method that creates and returns the entity set given the input data""" + """Helper method that creates and returns the entity set given the input data.""" ft_es = EntitySet() if self.index not in X.columns: es = ft_es.entity_from_dataframe( diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 99f588f350..c6312b11b3 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -39,7 +39,7 @@ def __init__(self, random_seed=0, **kwargs): super().__init__(random_seed=random_seed, **kwargs) def _clean_text(self, X): - """Remove all non-alphanum chars other than spaces, and make lowercase""" + """Remove all non-alphanum chars other than spaces, and make lowercase.""" def normalize(text): text = text.translate(str.maketrans("", "", string.punctuation)) @@ -73,7 +73,7 @@ def _make_entity_set(self, X, text_columns): return es def fit(self, X, y=None): - """Fits component to data + """Fits component to data. Arguments: X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] @@ -113,7 +113,7 @@ def _get_primitives_provenance(features): return provenance def transform(self, X, y=None): - """Transforms data X by creating new features using existing text columns + """Transforms data X by creating new features using existing text columns. Arguments: X (pd.DataFrame): The data to transform. diff --git a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py index 17a2356bc3..fff7319de4 100644 --- a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py +++ b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py @@ -30,7 +30,8 @@ def _get_columns_to_transform(self, X): def _get_feature_types_for_featuretools(self, X): """Get a mapping from column name to the feature tools type. - This is needed for dfs. Hopefully, once the ww/ft integration is complete this will be redundant. + This is needed for dfs. Hopefully, once the ww/ft integration is + complete this will be redundant. """ def _make_entity_set(self, X): diff --git a/evalml/pipelines/components/transformers/samplers/base_sampler.py b/evalml/pipelines/components/transformers/samplers/base_sampler.py index 9eaf86499b..ab82a1d033 100644 --- a/evalml/pipelines/components/transformers/samplers/base_sampler.py +++ b/evalml/pipelines/components/transformers/samplers/base_sampler.py @@ -8,8 +8,7 @@ class BaseSampler(Transformer): - """ - Base Sampler component. Used as the base class of all sampler components. + """Base Sampler component. Used as the base class of all sampler components. 
Arguments:
        parameters (dict): Dictionary of parameters for the component. Defaults to None.
@@ -76,9 +75,7 @@ def transform(self, X, y=None):
        return infer_feature_types(X_new), infer_feature_types(y_new)
 
    def _convert_dictionary(self, sampling_dict, y):
-        """Converts the provided sampling dictionary from a dictionary of ratios to a dictionary of number of samples.
-        Expects the provided dictionary keys to be the target values y, and the associated values to be the min:max ratios.
-        Converts and returns a dictionary with the same keys, but changes the values to be the number of samples rather than ratio.
+        """Converts the provided sampling dictionary from a dictionary of ratios to a dictionary of number of samples. Expects the provided dictionary keys to be the target values y, and the associated values to be the min:max ratios. Converts and returns a dictionary with the same keys, but changes the values to be the number of samples rather than ratio.
 
        Arguments:
            sampling_dict (dict): The input sampling dictionary passed in from user.
@@ -112,8 +109,7 @@ def _convert_dictionary(self, sampling_dict, y):
        return new_dic
 
    def _dictionary_to_params(self, sampling_dict, y):
-        """If a sampling ratio dictionary is provided, add the updated sampling dictionary to the
-        parameters and return the updated parameter dictionary. Otherwise, simply return the current parameters.
+        """If a sampling ratio dictionary is provided, add the updated sampling dictionary to the parameters and return the updated parameter dictionary. Otherwise, simply return the current parameters.
 
        Arguments:
            sampling_dict (dict): The input sampling dictionary passed in from user.
@@ -135,8 +131,7 @@ def fit_transform(self, X, y):
 
 
 class BaseOversampler(BaseSampler):
-    """
-    Base Oversampler component. Used as the base class of all imbalance-learn oversampler components.
+    """Base Oversampler component. Used as the base class of all imbalance-learn oversampler components.
 
    Arguments:
        sampler (obj): Sampler object to use.
@@ -179,8 +174,7 @@ def __init__(
        )
 
    def _initialize_sampler(self, X, y):
-        """Initializes the oversampler with the given sampler_ratio or sampler_ratio_dict. If a sampler_ratio_dict is provided, we will opt to use that.
-        Otherwise, we use will create the sampler_ratio_dict dictionary.
+        """Initializes the oversampler with the given sampler_ratio or sampler_ratio_dict. If a sampler_ratio_dict is provided, we will opt to use that. Otherwise, we will create the sampler_ratio_dict dictionary.
 
        Arguments:
            X (pd.DataFrame): Input features.
diff --git a/evalml/pipelines/components/transformers/samplers/oversamplers.py b/evalml/pipelines/components/transformers/samplers/oversamplers.py
index 03ea697f69..a7c9c429e4 100644
--- a/evalml/pipelines/components/transformers/samplers/oversamplers.py
+++ b/evalml/pipelines/components/transformers/samplers/oversamplers.py
@@ -40,8 +40,7 @@ def __init__(
 
 
 class SMOTENCOversampler(BaseOversampler):
-    """SMOTENC Oversampler component. Uses SMOTENC to generate synthetic samples. Works on a mix of numerical and categorical columns.
-    Input data must be Woodwork type, and this component is only run during training and not during predict.
+    """SMOTENC Oversampler component. Uses SMOTENC to generate synthetic samples. Works on a mix of numerical and categorical columns. Input data must be Woodwork type, and this component is only run during training and not during predict.
 
    Arguments:
        sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1].
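# Editor's aside: a sketch of the ratio-to-count conversion _convert_dictionary
# (above) performs -- keys are target values, values go from minority:majority
# ratios to sample counts. The rounding and "never downsample" choices are
# assumptions for illustration.
import pandas as pd

def ratios_to_counts(sampling_dict, y):
    counts = pd.Series(y).value_counts()
    n_majority = counts.max()
    new_counts = {}
    for label, ratio in sampling_dict.items():
        target = int(n_majority * ratio)
        # Oversamplers only add rows, so keep at least the existing count.
        new_counts[label] = max(target, int(counts[label]))
    return new_counts

y = ["a"] * 100 + ["b"] * 10
print(ratios_to_counts({"b": 0.5}, y))  # {'b': 50}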
A value of 0.25 means we want a 1:4 ratio @@ -92,9 +91,7 @@ def fit(self, X, y): class SMOTENOversampler(BaseOversampler): - """ - SMOTEN Oversampler component. Uses SMOTEN to generate synthetic samples. Works for purely categorical datasets. - This component is only run during training and not during predict. + """SMOTEN Oversampler component. Uses SMOTEN to generate synthetic samples. Works for purely categorical datasets. This component is only run during training and not during predict. Arguments: sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means we want a 1:4 ratio diff --git a/evalml/pipelines/components/transformers/samplers/undersampler.py b/evalml/pipelines/components/transformers/samplers/undersampler.py index 21440e0d7d..091ce9530f 100644 --- a/evalml/pipelines/components/transformers/samplers/undersampler.py +++ b/evalml/pipelines/components/transformers/samplers/undersampler.py @@ -9,8 +9,7 @@ class Undersampler(BaseSampler): - """ - Initializes an undersampling transformer to downsample the majority classes in the dataset. + """Initializes an undersampling transformer to downsample the majority classes in the dataset. This component is only run during training and not during predict. diff --git a/evalml/pipelines/components/transformers/scalers/standard_scaler.py b/evalml/pipelines/components/transformers/scalers/standard_scaler.py index 4ad6393de8..168c6735ce 100644 --- a/evalml/pipelines/components/transformers/scalers/standard_scaler.py +++ b/evalml/pipelines/components/transformers/scalers/standard_scaler.py @@ -14,7 +14,6 @@ class StandardScaler(Transformer): Arguments: random_seed (int): Seed for the random number generator. Defaults to 0. - """ name = "Standard Scaler" diff --git a/evalml/pipelines/components/transformers/transformer.py b/evalml/pipelines/components/transformers/transformer.py index 9a256d56c7..13d8316f01 100644 --- a/evalml/pipelines/components/transformers/transformer.py +++ b/evalml/pipelines/components/transformers/transformer.py @@ -12,8 +12,7 @@ class Transformer(ComponentBase): - """A component that may or may not need fitting that transforms data. - These components are used before an estimator. + """A component that may or may not need fitting that transforms data. These components are used before an estimator. To implement a new Transformer, define your own class which is a subclass of Transformer, including a name and a list of acceptable ranges for any parameters to be tuned during the automl search (hyperparameters). @@ -59,7 +58,7 @@ def transform(self, X, y=None): ) def fit_transform(self, X, y=None): - """Fits on X and transforms X + """Fits on X and transforms X. Arguments: X (pd.DataFrame): Data to fit and transform diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index 8b9dc23fd4..b49dbf8c2e 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -131,8 +131,7 @@ class WrappedSKClassifier(BaseEstimator, ClassifierMixin): """Scikit-learn classifier wrapper class.""" def __init__(self, pipeline): - """Scikit-learn classifier wrapper class. Takes an EvalML pipeline as input - and returns a scikit-learn classifier class wrapping that pipeline. + """Scikit-learn classifier wrapper class. Takes an EvalML pipeline as input and returns a scikit-learn classifier class wrapping that pipeline. 
Arguments: pipeline (PipelineBase or subclass obj): EvalML pipeline @@ -145,7 +144,7 @@ def __init__(self, pipeline): self.classes_ = pipeline.classes_ def fit(self, X, y): - """Fits component to data + """Fits component to data. Arguments: X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] @@ -190,8 +189,7 @@ class WrappedSKRegressor(BaseEstimator, RegressorMixin): """Scikit-learn regressor wrapper class.""" def __init__(self, pipeline): - """Scikit-learn regressor wrapper class. Takes an EvalML pipeline as input - and returns a scikit-learn regressor class wrapping that pipeline. + """Scikit-learn regressor wrapper class. Takes an EvalML pipeline as input and returns a scikit-learn regressor class wrapping that pipeline. Arguments: pipeline (PipelineBase or subclass obj): EvalML pipeline @@ -203,7 +201,7 @@ def __init__(self, pipeline): self._is_fitted = True def fit(self, X, y): - """Fits component to data + """Fits component to data. Arguments: X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] @@ -295,9 +293,7 @@ def generate_component_code(element): def make_balancing_dictionary(y, sampling_ratio): - """Makes dictionary for oversampler components. Find ratio of each class to the majority. - If the ratio is smaller than the sampling_ratio, we want to oversample, - otherwise, we don't want to sample at all, and we leave the data as is. + """Makes dictionary for oversampler components. Find ratio of each class to the majority. If the ratio is smaller than the sampling_ratio, we want to oversample, otherwise, we don't want to sample at all, and we leave the data as is. Arguments: y (pd.Series): Target data diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 9e02125d9b..455e37fc59 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -38,8 +38,7 @@ class PipelineBase(ABC, metaclass=PipelineBaseMeta): - """ - Machine learning pipeline made out of transformers and an Estimator. + """Machine learning pipeline made out of transformers and an Estimator. Arguments: component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. @@ -130,6 +129,7 @@ def name(self): @property def summary(self): """A short summary of the pipeline structure, describing the list of components used. + Example: Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder """ component_graph = [ @@ -202,19 +202,18 @@ def __setitem__(self, index, value): raise NotImplementedError("Setting pipeline components is not supported.") def get_component(self, name): - """Returns component by name + """Returns component by name. Arguments: name (str): Name of component Returns: Component: Component to return - """ return self.component_graph.get_component(name) def describe(self, return_dict=False): - """Outputs pipeline details including component parameters + """Outputs pipeline details including component parameters. Arguments: return_dict (bool): If True, return dictionary of information about pipeline. Defaults to False. @@ -273,7 +272,6 @@ def fit(self, X, y): Returns: self - """ def transform(self, X, y=None): @@ -468,7 +466,7 @@ def graph(self, filepath=None): return graph def graph_feature_importance(self, importance_threshold=0): - """Generate a bar graph of the pipeline's feature importance + """Generate a bar graph of the pipeline's feature importance. 
Arguments: importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is larger than importance_threshold. Defaults to zero. @@ -515,7 +513,7 @@ def graph_feature_importance(self, importance_threshold=0): return fig def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): - """Saves pipeline at file path + """Saves pipeline at file path. Arguments: file_path (str): location to save file @@ -529,7 +527,7 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): @staticmethod def load(file_path): - """Loads pipeline at file path + """Loads pipeline at file path. Arguments: file_path (str): location to load file @@ -554,8 +552,7 @@ def clone(self): ) def new(self, parameters, random_seed=0): - """Constructs a new instance of the pipeline with the same component graph but with a different set of parameters. - Not to be confused with python's __new__ method. + """Constructs a new instance of the pipeline with the same component graph but with a different set of parameters. Not to be confused with python's __new__ method. Arguments: parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. @@ -686,7 +683,6 @@ def can_tune_threshold_with_objective(self, objective): Returns: bool: True if the pipeline threshold can be tuned. - """ return ( is_binary(self.problem_type) @@ -705,8 +701,7 @@ def inverse_transform(self, y): return self.component_graph.inverse_transform(y) def get_hyperparameter_ranges(self, custom_hyperparameters): - """ - Returns hyperparameter ranges from all components as a dictionary. + """Returns hyperparameter ranges from all components as a dictionary. Arguments: custom_hyperparameters (dict): Custom hyperparameters for the pipeline. diff --git a/evalml/pipelines/pipeline_meta.py b/evalml/pipelines/pipeline_meta.py index 3698cb160b..02710fc2cb 100644 --- a/evalml/pipelines/pipeline_meta.py +++ b/evalml/pipelines/pipeline_meta.py @@ -5,12 +5,14 @@ class PipelineBaseMeta(BaseMeta): - """Metaclass that overrides creating a new pipeline by wrapping methods with validators and setters""" + """Metaclass that overrides creating a new pipeline by wrapping methods with validators and setters.""" @classmethod def check_for_fit(cls, method): """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. - It raises an exception if `False` and calls and returns the wrapped method if `True`. + + It raises an exception if `False` and calls and returns the + wrapped method if `True`. """ @wraps(method) diff --git a/evalml/pipelines/regression_pipeline.py b/evalml/pipelines/regression_pipeline.py index 7312102a20..5a470367a6 100644 --- a/evalml/pipelines/regression_pipeline.py +++ b/evalml/pipelines/regression_pipeline.py @@ -40,7 +40,7 @@ def fit(self, X, y): return self def score(self, X, y, objectives): - """Evaluate model performance on current and additional objectives + """Evaluate model performance on current and additional objectives. 
Arguments: X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features] diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 870104a384..ec6663fc23 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -159,7 +159,7 @@ def _get_preprocessing_components( def _get_pipeline_base_class(problem_type): - """Returns pipeline base class for problem_type""" + """Returns pipeline base class for problem_type.""" if problem_type == ProblemTypes.BINARY: return BinaryClassificationPipeline elif problem_type == ProblemTypes.MULTICLASS: @@ -183,9 +183,7 @@ def make_pipeline( sampler_name=None, extra_components=None, ): - """Given input data, target data, an estimator class and the problem type, - generates a pipeline class with a preprocessing chain which was recommended based on the inputs. - The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. + """Given input data, target data, an estimator class and the problem type, generates a pipeline class with a preprocessing chain which was recommended based on the inputs. The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. Arguments: X (pd.DataFrame): The input data of shape [n_samples, n_features] @@ -200,7 +198,6 @@ def make_pipeline( Returns: PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator - """ X = infer_feature_types(X) y = infer_feature_types(y) @@ -257,8 +254,7 @@ def generate_pipeline_code(element): def _make_stacked_ensemble_pipeline( input_pipelines, problem_type, n_jobs=-1, random_seed=0 ): - """ - Creates a pipeline with a stacked ensemble estimator. + """Creates a pipeline with a stacked ensemble estimator. Arguments: input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators for the stacked ensemble. @@ -313,8 +309,7 @@ def _make_stacked_ensemble_pipeline( def _make_component_list_from_actions(actions): - """ - Creates a list of components from the input DataCheckAction list + """Creates a list of components from the input DataCheckAction list. Arguments: actions (list(DataCheckAction)): List of DataCheckAction objects used to create list of components diff --git a/evalml/preprocessing/data_splitters/training_validation_split.py b/evalml/preprocessing/data_splitters/training_validation_split.py index f3b3d17dde..92225de76e 100644 --- a/evalml/preprocessing/data_splitters/training_validation_split.py +++ b/evalml/preprocessing/data_splitters/training_validation_split.py @@ -33,11 +33,11 @@ def __init__( @staticmethod def get_n_splits(): - """Returns the number of splits of this object""" + """Returns the number of splits of this object.""" return 1 def split(self, X, y=None): - """Divides the data into training and testing sets + """Divides the data into training and testing sets. 
Arguments:
            X (pd.DataFrame): Dataframe of points to split
diff --git a/evalml/problem_types/utils.py b/evalml/problem_types/utils.py
index 9fa83e6804..7aecc9f983 100644
--- a/evalml/problem_types/utils.py
+++ b/evalml/problem_types/utils.py
@@ -27,8 +27,7 @@ def handle_problem_types(problem_type):
 
 
 def detect_problem_type(y):
-    """Determine the type of problem is being solved based on the targets (binary vs multiclass classification, regression)
-    Ignores missing and null data
+    """Determine the type of problem being solved based on the targets (binary vs multiclass classification, regression). Ignores missing and null data.
 
     Arguments:
         y (pd.Series): the target labels to predict
@@ -54,13 +53,14 @@ def detect_problem_type(y):
 
 
 def is_regression(problem_type):
-    """Determines if the provided problem_type is a regression problem type
+    """Determines if the provided problem_type is a regression problem type.
 
     Arguments:
         problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.
 
     Returns:
-        bool: Whether or not the provided problem_type is a regression problem type."""
+        bool: Whether or not the provided problem_type is a regression problem type.
+    """
     return handle_problem_types(problem_type) in [
         ProblemTypes.REGRESSION,
         ProblemTypes.TIME_SERIES_REGRESSION,
     ]
@@ -68,13 +68,14 @@ def is_binary(problem_type):
-    """Determines if the provided problem_type is a binary classification problem type
+    """Determines if the provided problem_type is a binary classification problem type.
 
     Arguments:
         problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.
 
     Returns:
-        bool: Whether or not the provided problem_type is a binary classification problem type."""
+        bool: Whether or not the provided problem_type is a binary classification problem type.
+    """
     return handle_problem_types(problem_type) in [
         ProblemTypes.BINARY,
         ProblemTypes.TIME_SERIES_BINARY,
     ]
@@ -82,13 +83,14 @@ def is_multiclass(problem_type):
-    """Determines if the provided problem_type is a multiclass classification problem type
+    """Determines if the provided problem_type is a multiclass classification problem type.
 
     Arguments:
         problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.
 
     Returns:
-        bool: Whether or not the provided problem_type is a multiclass classification problem type."""
+        bool: Whether or not the provided problem_type is a multiclass classification problem type.
+    """
     return handle_problem_types(problem_type) in [
         ProblemTypes.MULTICLASS,
         ProblemTypes.TIME_SERIES_MULTICLASS,
     ]
@@ -96,24 +98,26 @@ def is_classification(problem_type):
-    """Determines if the provided problem_type is a classification problem type
+    """Determines if the provided problem_type is a classification problem type.
 
     Arguments:
         problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.
 
     Returns:
-        bool: Whether or not the provided problem_type is a classification problem type."""
+        bool: Whether or not the provided problem_type is a classification problem type. 
+ """ return is_binary(problem_type) or is_multiclass(problem_type) def is_time_series(problem_type): - """Determines if the provided problem_type is a time series problem type + """Determines if the provided problem_type is a time series problem type. Arguments: problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. Returns: - bool: Whether or not the provided problem_type is a time series problem type.""" + bool: Whether or not the provided problem_type is a time series problem type. + """ return handle_problem_types(problem_type) in [ ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, diff --git a/evalml/tests/automl_tests/dask_test_utils.py b/evalml/tests/automl_tests/dask_test_utils.py index 3cb16111f9..cd393d32ab 100644 --- a/evalml/tests/automl_tests/dask_test_utils.py +++ b/evalml/tests/automl_tests/dask_test_utils.py @@ -9,7 +9,7 @@ # Top-level replacement for AutoML object to supply data for testing purposes. def err_call(*args, **kwargs): - """No-op""" + """No-op.""" data_splitter = TrainingValidationSplit() @@ -96,9 +96,11 @@ def score(self, X, y, objectives): class DaskPipelineSlow(BinaryClassificationPipeline): - """Pipeline for testing whose fit() should take longer than the - fast pipeline. This exists solely to test AutoMLSearch termination - and not complete fitting.""" + """Pipeline for testing whose fit() should take longer than the fast pipeline. + + This exists solely to test AutoMLSearch termination and not complete + fitting. + """ component_graph = ["Baseline Classifier"] custom_name = "SlowPipeline" @@ -123,9 +125,11 @@ def fit(self, X, y): class DaskPipelineFast(BinaryClassificationPipeline): - """Pipeline for testing whose fit() should complete before the - slow pipeline. This exists solely to test AutoMLSearch termination - and complete fitting.""" + """Pipeline for testing whose fit() should complete before the slow pipeline. + + This exists solely to test AutoMLSearch termination and complete + fitting. + """ component_graph = ["Baseline Classifier"] custom_name = "FastPipeline" diff --git a/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py b/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py index fa5102ca9e..9b4f4fd1fc 100644 --- a/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py +++ b/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py @@ -51,12 +51,12 @@ def process_pool(): def _get_engine_support(parallel_engine_type, thread_pool, cluster): - """Helper function to return the proper combination of resource pool, client class and - engine class for testing purposes. + """Helper function to return the proper combination of resource pool, client class and engine class for testing purposes. - e.g. The CFEngine can be run either with a ThreadPoolExecutor or a ProcessPoolExecutor, - so _get_engine_support("CFEngine", thread_pool, cluster) returns a - tuple of (ThreadPoolExecutor, cf.Client, cf.CFEngine) + e.g. 
The CFEngine can be run either with a ThreadPoolExecutor or a + ProcessPoolExecutor, so _get_engine_support("CFEngine", + thread_pool, cluster) returns a tuple of (ThreadPoolExecutor, + cf.Client, cf.CFEngine) """ if parallel_engine_type == "CFEngine": resources = thread_pool diff --git a/evalml/tests/automl_tests/parallel_tests/test_cf_engine.py b/evalml/tests/automl_tests/parallel_tests/test_cf_engine.py index 3bf4d1d293..06f397453a 100644 --- a/evalml/tests/automl_tests/parallel_tests/test_cf_engine.py +++ b/evalml/tests/automl_tests/parallel_tests/test_cf_engine.py @@ -59,8 +59,7 @@ def test_init(process_pool): def test_submit_training_job_single( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that training a single pipeline using the parallel engine produces the - same results as simply running the train_pipeline function.""" + """Test that training a single pipeline using the parallel engine produces the same results as simply running the train_pipeline function.""" X, y = X_y_binary_cls pool = get_pool(pool_type, thread_pool, process_pool) with CFClient(pool) as client: @@ -91,8 +90,7 @@ def test_submit_training_job_single( def test_submit_training_jobs_multiple( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that training multiple pipelines using the parallel engine produces the - same results as the sequential engine.""" + """Test that training multiple pipelines using the parallel engine produces the same results as the sequential engine.""" X, y = X_y_binary_cls pool = get_pool(pool_type, thread_pool, process_pool) with CFClient(pool) as client: @@ -136,8 +134,7 @@ def fit_pipelines(pipelines, engine): def test_submit_evaluate_job_single( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that evaluating a single pipeline using the parallel engine produces the - same results as simply running the evaluate_pipeline function.""" + """Test that evaluating a single pipeline using the parallel engine produces the same results as simply running the evaluate_pipeline function.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -191,8 +188,7 @@ def test_submit_evaluate_job_single( def test_submit_evaluate_jobs_multiple( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that evaluating multiple pipelines using the parallel engine produces the - same results as the sequential engine.""" + """Test that evaluating multiple pipelines using the parallel engine produces the same results as the sequential engine.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -248,8 +244,7 @@ def eval_pipelines(pipelines, engine): def test_submit_scoring_job_single( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that scoring a single pipeline using the parallel engine produces the - same results as simply running the score_pipeline function.""" + """Test that scoring a single pipeline using the parallel engine produces the same results as simply running the score_pipeline function.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -288,8 +283,7 @@ def test_submit_scoring_job_single( def test_submit_scoring_jobs_multiple( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that scoring multiple pipelines using the parallel engine produces the - same results as the sequential engine.""" + """Test that scoring multiple pipelines using the parallel engine produces the same results as the sequential engine.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -344,8 
+338,7 @@ def score_pipelines(pipelines, engine): @pytest.mark.parametrize("pool_type", ["threads", "processes"]) def test_cancel_job(X_y_binary_cls, pool_type, thread_pool, process_pool): - """Test that training a single pipeline using the parallel engine produces the - same results as simply running the train_pipeline function.""" + """Test that training a single pipeline using the parallel engine produces the same results as simply running the train_pipeline function.""" X, y = X_y_binary_cls pool = get_pool(pool_type, thread_pool, process_pool) diff --git a/evalml/tests/automl_tests/parallel_tests/test_dask_engine.py b/evalml/tests/automl_tests/parallel_tests/test_dask_engine.py index 33c18ccd92..ec424765f9 100644 --- a/evalml/tests/automl_tests/parallel_tests/test_dask_engine.py +++ b/evalml/tests/automl_tests/parallel_tests/test_dask_engine.py @@ -42,8 +42,7 @@ def test_init(cluster): def test_submit_training_job_single(X_y_binary_cls, cluster): - """Test that training a single pipeline using the parallel engine produces the - same results as simply running the train_pipeline function.""" + """Test that training a single pipeline using the parallel engine produces the same results as simply running the train_pipeline function.""" X, y = X_y_binary_cls with Client(cluster) as client: engine = DaskEngine(client=client) @@ -70,8 +69,7 @@ def test_submit_training_job_single(X_y_binary_cls, cluster): def test_submit_training_jobs_multiple(X_y_binary_cls, cluster): - """Test that training multiple pipelines using the parallel engine produces the - same results as the sequential engine.""" + """Test that training multiple pipelines using the parallel engine produces the same results as the sequential engine.""" X, y = X_y_binary_cls with Client(cluster) as client: pipelines = [ @@ -111,8 +109,7 @@ def fit_pipelines(pipelines, engine): def test_submit_evaluate_job_single(X_y_binary_cls, cluster): - """Test that evaluating a single pipeline using the parallel engine produces the - same results as simply running the evaluate_pipeline function.""" + """Test that evaluating a single pipeline using the parallel engine produces the same results as simply running the evaluate_pipeline function.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -162,8 +159,7 @@ def test_submit_evaluate_job_single(X_y_binary_cls, cluster): def test_submit_evaluate_jobs_multiple(X_y_binary_cls, cluster): - """Test that evaluating multiple pipelines using the parallel engine produces the - same results as the sequential engine.""" + """Test that evaluating multiple pipelines using the parallel engine produces the same results as the sequential engine.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -215,8 +211,7 @@ def eval_pipelines(pipelines, engine): def test_submit_scoring_job_single(X_y_binary_cls, cluster): - """Test that scoring a single pipeline using the parallel engine produces the - same results as simply running the score_pipeline function.""" + """Test that scoring a single pipeline using the parallel engine produces the same results as simply running the score_pipeline function.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -251,8 +246,7 @@ def test_submit_scoring_job_single(X_y_binary_cls, cluster): def test_submit_scoring_jobs_multiple(X_y_binary_cls, cluster): - """Test that scoring multiple pipelines using the parallel engine produces the - same results as the sequential engine.""" + """Test that scoring multiple pipelines using the parallel engine produces 
the same results as the sequential engine.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -305,8 +299,7 @@ def score_pipelines(pipelines, engine): def test_cancel_job(X_y_binary_cls, cluster): - """Test that training a single pipeline using the parallel engine produces the - same results as simply running the train_pipeline function.""" + """Test that training a single pipeline using the parallel engine produces the same results as simply running the train_pipeline function.""" X, y = X_y_binary_cls with Client(cluster) as client: diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 66ebf87d9a..c545d90321 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1640,6 +1640,7 @@ def __init__(self, k, starting_index): def __call__(self): """Raises KeyboardInterrupt on the kth call. + Arguments are ignored but included to meet the call back API. """ if self.n_calls == self.k: @@ -2000,7 +2001,7 @@ def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) def fit(self, *args, **kwargs): - """Mocking fit""" + """Mocking fit.""" class Pipeline1(DummyPipeline): custom_name = "Pipeline1" @@ -2167,7 +2168,7 @@ def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) def fit(self, *args, **kwargs): - """Mocking fit""" + """Mocking fit.""" additional_objectives = None if custom_additional_objective: diff --git a/evalml/tests/component_tests/test_catboost_classifier.py b/evalml/tests/component_tests/test_catboost_classifier.py index d19eb387a4..e8f33d2e9f 100644 --- a/evalml/tests/component_tests/test_catboost_classifier.py +++ b/evalml/tests/component_tests/test_catboost_classifier.py @@ -10,7 +10,7 @@ def test_catboost_classifier_random_seed_bounds_seed(X_y_binary): - """ensure catboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" + """ensure catboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" X, y = X_y_binary col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/component_tests/test_catboost_regressor.py b/evalml/tests/component_tests/test_catboost_regressor.py index 84244c56ad..5acbf7fdda 100644 --- a/evalml/tests/component_tests/test_catboost_regressor.py +++ b/evalml/tests/component_tests/test_catboost_regressor.py @@ -10,7 +10,7 @@ def test_catboost_regressor_random_seed_bounds_seed(X_y_regression): - """ensure catboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" + """ensure catboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" X, y = X_y_regression col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py index 1766024f0a..f121ed8061 100644 --- a/evalml/tests/component_tests/test_lgbm_classifier.py +++ b/evalml/tests/component_tests/test_lgbm_classifier.py @@ -28,7 +28,7 @@ def test_problem_types(): def test_lightgbm_classifier_random_seed_bounds_seed(X_y_binary): - """ensure lightgbm's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" + """ensure lightgbm's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" X, y = X_y_binary col_names = ["col_{}".format(i) for i in 
range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/component_tests/test_lgbm_regressor.py b/evalml/tests/component_tests/test_lgbm_regressor.py index 1b20b23bd4..983b7514b4 100644 --- a/evalml/tests/component_tests/test_lgbm_regressor.py +++ b/evalml/tests/component_tests/test_lgbm_regressor.py @@ -23,7 +23,7 @@ def test_problem_types(): def test_lightgbm_regressor_random_seed_bounds_seed(X_y_regression): - """ensure lightgbm's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" + """ensure lightgbm's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" X, y = X_y_regression col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py index 0298d97357..f6d4650070 100644 --- a/evalml/tests/component_tests/test_simple_imputer.py +++ b/evalml/tests/component_tests/test_simple_imputer.py @@ -413,10 +413,7 @@ def test_simple_imputer_woodwork_custom_overrides_returned_by_components( def test_component_handles_pre_init_ww(): - """Test to determine whether SimpleImputer can handle - a Woodwork-inited DataFrame with partially null and fully - null columns (post Woodwork 0.5.1) and still perform the - expected behavior.""" + """Test to determine whether SimpleImputer can handle a Woodwork-inited DataFrame with partially null and fully null columns (post Woodwork 0.5.1) and still perform the expected behavior.""" df = pd.DataFrame( {"part_null": [0, 1, 2, None], "all_null": [None, None, None, None]} ) diff --git a/evalml/tests/component_tests/test_xgboost_classifier.py b/evalml/tests/component_tests/test_xgboost_classifier.py index 9357572e96..846708e018 100644 --- a/evalml/tests/component_tests/test_xgboost_classifier.py +++ b/evalml/tests/component_tests/test_xgboost_classifier.py @@ -13,7 +13,7 @@ def test_xgboost_classifier_random_seed_bounds_seed(X_y_binary): - """ensure xgboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" + """ensure xgboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" X, y = X_y_binary col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/component_tests/test_xgboost_regressor.py b/evalml/tests/component_tests/test_xgboost_regressor.py index 9b2e809fbf..87c2c764c3 100644 --- a/evalml/tests/component_tests/test_xgboost_regressor.py +++ b/evalml/tests/component_tests/test_xgboost_regressor.py @@ -12,7 +12,7 @@ def test_xgboost_regressor_random_seed_bounds_seed(X_y_regression): - """ensure xgboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" + """ensure xgboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" X, y = X_y_regression col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index b80661d0bf..095c2d68eb 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -1039,7 +1039,7 @@ def churn_local(): @pytest.fixture def mock_imbalanced_data_X_y(): - """Helper function to return an imbalanced binary or multiclass dataset""" + """Helper function to return an imbalanced binary or multiclass dataset.""" def _imbalanced_data_X_y(problem_type, categorical_columns, size): """ 
"Generates a dummy classification dataset with particular amounts of class imbalance and categorical input columns. @@ -1153,8 +1153,7 @@ def _patch_method(self, method, side_effect, return_value, pipeline_class_str=No return patch(pipeline_class_str + "." + method, **kwargs) def _reset_mocks(self): - """Set the mocks to None before running a computation so that we can prevent users from trying to access - them before leaving the context manager.""" + """Set the mocks to None before running a computation so that we can prevent users from trying to access them before leaving the context manager.""" self._mock_fit = None self._mock_tell = None self._mock_score = None @@ -1205,8 +1204,7 @@ def test_context( predict_proba_return_value=None, optimize_threshold_return_value=0.2, ): - """A context manager for creating an environment that patches time-consuming pipeline methods. - Sets the mock_fit, mock_score, mock_encode_targets, mock_predict_proba, mock_optimize_threshold attributes. + """A context manager for creating an environment that patches time-consuming pipeline methods. Sets the mock_fit, mock_score, mock_encode_targets, mock_predict_proba, mock_optimize_threshold attributes. Arguments: score_return_value: Passed as the return_value argument of the pipeline.score patch. diff --git a/evalml/tests/data_checks_tests/test_data_check.py b/evalml/tests/data_checks_tests/test_data_check.py index 10099a9992..58393ab572 100644 --- a/evalml/tests/data_checks_tests/test_data_check.py +++ b/evalml/tests/data_checks_tests/test_data_check.py @@ -22,7 +22,7 @@ def test_data_check_name(mock_data_check_class): assert mock_data_check_class.name == "MockDataCheck" class Funky_Name1DataCheck(mock_data_check_class): - """Mock data check with a funky name""" + """Mock data check with a funky name.""" assert Funky_Name1DataCheck().name == "Funky_Name1DataCheck" assert Funky_Name1DataCheck.name == "Funky_Name1DataCheck" diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py index 1c495ee5cb..772cae3c58 100644 --- a/evalml/tests/data_checks_tests/test_data_checks.py +++ b/evalml/tests/data_checks_tests/test_data_checks.py @@ -564,7 +564,7 @@ class MockCheck(DataCheck): name = "mock_check" def __init__(self, foo, bar, baz=3): - """Mock init""" + """Mock init.""" def validate(self, X, y=None): """Mock validate.""" @@ -574,7 +574,7 @@ class MockCheck2(DataCheck): name = "MockCheck" def __init__(self, foo, bar, baz=3): - """Mock init""" + """Mock init.""" def validate(self, X, y=None): """Mock validate.""" diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py index 558c37ff68..68c99a3ad0 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py @@ -33,8 +33,10 @@ def make_test_pipeline(estimator, base_class): """Make an estimator-only pipeline. - This is helps test the exceptions raised in _compute_shap_values without having to use make_pipeline - (which needs training data to be passed in). + + This is helps test the exceptions raised in _compute_shap_values + without having to use make_pipeline (which needs training data to be + passed in). 
""" class Pipeline(base_class): diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index d057b496e1..8e7e817a51 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -1311,9 +1311,10 @@ def test_graph_partial_dependence_ice_plot_two_way_error( def test_partial_dependence_scale_error(): - """Test to catch the case when the scale of the features is so small - that the 5th and 95th percentiles are too close to each other. This is - an sklearn exception.""" + """Test to catch the case when the scale of the features is so small that the 5th and 95th percentiles are too close to each other. + + This is an sklearn exception. + """ pl = RegressionPipeline(["Random Forest Regressor"]) X = pd.DataFrame({"a": list(range(30)), "b": list(range(-10, 20))}) diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index 1833cf1b7a..168e9f5093 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -22,9 +22,10 @@ class DoubleColumns(Transformer): """Custom transformer for testing permutation importance implementation. - We don't have any transformers that create features that you can repeatedly "stack" on the previous output. - That being said, I want to test that our implementation can handle that case in the event we add a transformer like - that in the future. + We don't have any transformers that create features that you can + repeatedly "stack" on the previous output. That being said, I want + to test that our implementation can handle that case in the event we + add a transformer like that in the future. """ name = "DoubleColumns" diff --git a/evalml/tests/objective_tests/test_binary_classification_objective.py b/evalml/tests/objective_tests/test_binary_classification_objective.py index 5c009e266a..1b4254af22 100644 --- a/evalml/tests/objective_tests/test_binary_classification_objective.py +++ b/evalml/tests/objective_tests/test_binary_classification_objective.py @@ -74,7 +74,7 @@ def assign_problem_type(self): @abstractmethod def assign_objective(self, **kwargs): - """Get objective object using specified parameters""" + """Get objective object using specified parameters.""" def run_pipeline(self, X_y_binary, **kwargs): self.X, self.y = X_y_binary @@ -95,7 +95,7 @@ def run_pipeline(self, X_y_binary, **kwargs): @abstractmethod def test_score(self, y_true, y_predicted, expected_score): - """Objective score matches expected score + """Objective score matches expected score. 
Args: y_true (pd.Series): true classes @@ -105,7 +105,7 @@ def test_score(self, y_true, y_predicted, expected_score): @abstractmethod def test_all_base_tests(self): - """Run all relevant tests from the base class""" + """Run all relevant tests from the base class.""" @pytest.fixture(scope="class") def fix_y_pred_na(self): diff --git a/evalml/tests/objective_tests/test_objectives.py b/evalml/tests/objective_tests/test_objectives.py index b8db97c885..136c1ea119 100644 --- a/evalml/tests/objective_tests/test_objectives.py +++ b/evalml/tests/objective_tests/test_objectives.py @@ -23,7 +23,7 @@ def test_create_custom_objective(): class MockEmptyObjective(ObjectiveBase): def objective_function(self, y_true, y_predicted, X=None): - """Docstring for mock objective function""" + """Docstring for mock objective function.""" with pytest.raises(TypeError): MockEmptyObjective() diff --git a/evalml/tests/pipeline_tests/test_component_graph.py b/evalml/tests/pipeline_tests/test_component_graph.py index 330ce4ca39..a12e44e87d 100644 --- a/evalml/tests/pipeline_tests/test_component_graph.py +++ b/evalml/tests/pipeline_tests/test_component_graph.py @@ -54,15 +54,15 @@ def fit(self, X, y): class TransformerA(DummyTransformer): - """copy class""" + """copy class.""" class TransformerB(DummyTransformer): - """copy class""" + """copy class.""" class TransformerC(DummyTransformer): - """copy class""" + """copy class.""" class DummyEstimator(Estimator): @@ -81,15 +81,15 @@ def fit(self, X, y): class EstimatorA(DummyEstimator): - """copy class""" + """copy class.""" class EstimatorB(DummyEstimator): - """copy class""" + """copy class.""" class EstimatorC(DummyEstimator): - """copy class""" + """copy class.""" @pytest.fixture diff --git a/evalml/tests/utils_tests/test_woodwork_utils.py b/evalml/tests/utils_tests/test_woodwork_utils.py index 02f9f5b4bd..1fab9927c2 100644 --- a/evalml/tests/utils_tests/test_woodwork_utils.py +++ b/evalml/tests/utils_tests/test_woodwork_utils.py @@ -280,10 +280,10 @@ def test_ordinal_retains_order_min(): ), ) def test_infer_feature_types_NA_to_nan(null_col, already_inited): - """A short test to make sure that columns with all null values - get converted from woodwork Unknown logical type with string - physical type back to the original Double logical type with - float physical type. Other Unknown columns should remain unchanged.""" + """A short test to make sure that columns with all null values get converted from woodwork Unknown logical type with string physical type back to the original Double logical type with float physical type. + + Other Unknown columns should remain unchanged. + """ df = pd.DataFrame( { diff --git a/evalml/tuners/grid_search_tuner.py b/evalml/tuners/grid_search_tuner.py index b4205532d8..78585aebcc 100644 --- a/evalml/tuners/grid_search_tuner.py +++ b/evalml/tuners/grid_search_tuner.py @@ -55,8 +55,7 @@ def __init__(self, pipeline_hyperparameter_ranges, n_points=10, random_seed=0): self.curr_params = None def add(self, pipeline_parameters, score): - """Not applicable to grid search tuner as generated parameters are - not dependent on scores of previous parameters. + """Not applicable to grid search tuner as generated parameters are not dependent on scores of previous parameters. Arguments: pipeline_parameters (dict): a dict of the parameters used to evaluate a pipeline @@ -65,7 +64,7 @@ def add(self, pipeline_parameters, score): pass def propose(self): - """Returns parameters from _grid_points iterations + """Returns parameters from _grid_points iterations. 
If all possible combinations of parameters have been scored, then ``NoParamsException`` is raised. @@ -79,8 +78,7 @@ def propose(self): return self._convert_to_pipeline_parameters(params) def is_search_space_exhausted(self): - """Checks if it is possible to generate a set of valid parameters. Stores generated parameters in - ``self.curr_params`` to be returned by ``propose()``. + """Checks if it is possible to generate a set of valid parameters. Stores generated parameters in ``self.curr_params`` to be returned by ``propose()``. Raises: NoParamsException: If a search space is exhausted, then this exception is thrown. diff --git a/evalml/tuners/random_search_tuner.py b/evalml/tuners/random_search_tuner.py index d1fc9a484e..7cbf25dfa9 100644 --- a/evalml/tuners/random_search_tuner.py +++ b/evalml/tuners/random_search_tuner.py @@ -39,8 +39,7 @@ def __init__( self.curr_params = None def add(self, pipeline_parameters, score): - """Not applicable to random search tuner as generated parameters are - not dependent on scores of previous parameters. + """Not applicable to random search tuner as generated parameters are not dependent on scores of previous parameters. Arguments: pipeline_parameters (dict): A dict of the parameters used to evaluate a pipeline @@ -71,8 +70,7 @@ def propose(self): return self._convert_to_pipeline_parameters(params) def is_search_space_exhausted(self): - """Checks if it is possible to generate a set of valid parameters. Stores generated parameters in - ``self.curr_params`` to be returned by ``propose()``. + """Checks if it is possible to generate a set of valid parameters. Stores generated parameters in ``self.curr_params`` to be returned by ``propose()``. Raises: NoParamsException: If a search space is exhausted, then this exception is thrown. 
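The grid search and random search tuners above share one contract: ``propose()`` returns the next set of pipeline parameters and raises ``NoParamsException`` once every combination has been scored, while ``add()`` is a no-op because new proposals do not depend on the scores of earlier ones. A minimal usage sketch of that contract follows; the hyperparameter range and the ``evaluate`` stub are illustrative stand-ins, not part of this patch:

    from evalml.tuners import GridSearchTuner, NoParamsException

    # Search space keyed by component name, mirroring evalml's pipeline parameter dicts.
    # The range below is an illustrative assumption, not taken from this patch.
    tuner = GridSearchTuner(
        {"Imputer": {"impute_strategy": ["mean", "median", "most_frequent"]}},
        n_points=10,
    )

    def evaluate(params):
        # Hypothetical stand-in for fitting and scoring a pipeline with ``params``.
        return len(params["Imputer"]["impute_strategy"])

    best_score, best_params = None, None
    while True:
        try:
            params = tuner.propose()  # next un-scored parameter combination
        except NoParamsException:
            break  # the search space is exhausted
        score = evaluate(params)
        tuner.add(params, score)  # no-op for the grid and random search tuners
        if best_score is None or score > best_score:
            best_score, best_params = score, params

The same loop works with ``RandomSearchTuner``, which draws points at random from the search space instead of walking a fixed grid.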
diff --git a/evalml/tuners/tuner.py b/evalml/tuners/tuner.py index 5cc1f7bfe8..1143237d7b 100644 --- a/evalml/tuners/tuner.py +++ b/evalml/tuners/tuner.py @@ -54,7 +54,7 @@ def __init__(self, pipeline_hyperparameter_ranges, random_seed=0): self._search_space_ranges.append(parameter_range) def _convert_to_flat_parameters(self, pipeline_parameters): - """Convert from pipeline parameters to a flat list of values""" + """Convert from pipeline parameters to a flat list of values.""" flat_parameter_values = [] for flat_parameter_name in self._search_space_names: component_name, parameter_name = self._parameter_names_map[ @@ -75,7 +75,7 @@ def _convert_to_flat_parameters(self, pipeline_parameters): return flat_parameter_values def _convert_to_pipeline_parameters(self, flat_parameters): - """Convert from a flat list of values to a dict of pipeline parameters""" + """Convert from a flat list of values to a dict of pipeline parameters.""" pipeline_parameters = { component_name: dict() for component_name in self._component_names } diff --git a/evalml/utils/base_meta.py b/evalml/utils/base_meta.py index 3857350c20..490b81c0ba 100644 --- a/evalml/utils/base_meta.py +++ b/evalml/utils/base_meta.py @@ -3,7 +3,7 @@ class BaseMeta(ABCMeta): - """Metaclass that overrides creating a new component or pipeline by wrapping methods with validators and setters""" + """Metaclass that overrides creating a new component or pipeline by wrapping methods with validators and setters.""" FIT_METHODS = ["fit", "fit_transform"] METHODS_TO_CHECK = ["predict", "predict_proba", "transform", "inverse_transform"] diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index eef642b971..191b206bdf 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -18,8 +18,7 @@ def import_or_raise(library, error_msg=None, warning=False): - """Attempts to import the requested library by name. - If the import fails, raises an ImportError or warning. + """Attempts to import the requested library by name. If the import fails, raises an ImportError or warning. Arguments: library (str): the name of the library @@ -186,8 +185,7 @@ def _get_subclasses(base_class): def get_importable_subclasses(base_class, used_in_automl=True): - """Get importable subclasses of a base class. Used to list all of our - estimators, transformers, components and pipelines dynamically. + """Get importable subclasses of a base class. Used to list all of our estimators, transformers, components and pipelines dynamically. Arguments: base_class (abc.ABCMeta): Base class to find all of the subclasses for. @@ -222,9 +220,7 @@ def get_importable_subclasses(base_class, used_in_automl=True): def _rename_column_names_to_numeric(X, flatten_tuples=True): - """Used in LightGBM and XGBoost estimator classes to rename column names - when the input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) - that these estimators cannot natively handle. + """Used in LightGBM and XGBoost estimator classes to rename column names when the input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that these estimators cannot natively handle. Arguments: X (pd.DataFrame): The input training data of shape [n_samples, n_features] @@ -268,7 +264,7 @@ def jupyter_check(): def safe_repr(value): - """Convert the given value into a string that can safely be used for repr + """Convert the given value into a string that can safely be used for repr. 
Arguments: value: the item to convert @@ -285,7 +281,7 @@ def safe_repr(value): def is_all_numeric(df): - """Checks if the given DataFrame contains only numeric values + """Checks if the given DataFrame contains only numeric values. Arguments: df (pd.DataFrame): The DataFrame to check data types of. diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py index 1adb2269cc..b4cbaee625 100644 --- a/evalml/utils/woodwork_utils.py +++ b/evalml/utils/woodwork_utils.py @@ -45,8 +45,7 @@ def _raise_value_error_if_nullable_types_detected(data): def infer_feature_types(data, feature_types=None): - """Create a Woodwork structure from the given list, pandas, or numpy input, with specified types for columns. - If a column's type is not specified, it will be inferred by Woodwork. + """Create a Woodwork structure from the given list, pandas, or numpy input, with specified types for columns. If a column's type is not specified, it will be inferred by Woodwork. Arguments: data (pd.DataFrame, pd.Series): Input data to convert to a Woodwork data structure. @@ -113,10 +112,7 @@ def is_column_unknown(data, col): def _retain_custom_types_and_initalize_woodwork( old_logical_types, new_dataframe, ltypes_to_ignore=None ): - """ - Helper method which will take an old Woodwork data structure and a new pandas data structure and return a - new data structure that will try to retain as many logical types from the old data structure that exist in the new - pandas data structure as possible. + """Helper method which will take an old Woodwork data structure and a new pandas data structure and return a new data structure that will try to retain as many logical types from the old data structure that exist in the new pandas data structure as possible. Arguments: old_logical_types (Dict): Logical types to try to retain. @@ -153,15 +149,15 @@ def _retain_custom_types_and_initalize_woodwork( def _convert_numeric_dataset_pandas(X, y): - """Convert numeric and non-null data to pandas datatype. Raises ValueError if there is null or non-numeric data. - Used with data sampler strategies. + """Convert numeric and non-null data to pandas datatype. Raises ValueError if there is null or non-numeric data. Used with data sampler strategies. 
Arguments: X (pd.DataFrame, np.ndarray): Data to transform y (pd.Series, np.ndarray): Target data Returns: - Tuple(pd.DataFrame, pd.Series): Transformed X and y""" + Tuple(pd.DataFrame, pd.Series): Transformed X and y + """ X_ww = infer_feature_types(X) if not is_all_numeric(X_ww): raise ValueError( From 542eb74c7533775ebc76add81914902a69142569 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 26 Aug 2021 16:43:38 -0400 Subject: [PATCH 03/62] do some more cleanup --- Makefile | 2 +- evalml/demos/breast_cancer.py | 3 ++- evalml/demos/churn.py | 3 ++- evalml/demos/diabetes.py | 3 ++- evalml/demos/fraud.py | 4 +++- evalml/demos/wine.py | 7 +++++-- evalml/preprocessing/data_splitters/time_series_split.py | 8 ++++++-- 7 files changed, 21 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 371477126f..f9ed79c1cd 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ clean: .PHONY: lint lint: flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions - pydocstyle evalml + pydocstyle evalml --ignore=D107 black evalml -t py39 --check .PHONY: lint-fix diff --git a/evalml/demos/breast_cancer.py b/evalml/demos/breast_cancer.py index fd83efdbd6..964af99c43 100644 --- a/evalml/demos/breast_cancer.py +++ b/evalml/demos/breast_cancer.py @@ -8,7 +8,8 @@ def load_breast_cancer(): """Load breast cancer dataset. Binary classification problem. - Returns: + Return: + ------ (pd.Dataframe, pd.Series): X and y """ filepath = ( diff --git a/evalml/demos/churn.py b/evalml/demos/churn.py index 05336a659c..f39975cc50 100644 --- a/evalml/demos/churn.py +++ b/evalml/demos/churn.py @@ -10,7 +10,8 @@ def load_churn(n_rows=None, verbose=True): n_rows (int): Number of rows from the dataset to return verbose (bool): Whether to print information about features and labels - Returns: + Return: + ------ (pd.Dataframe, pd.Series): X and y """ churn_data_path = ( diff --git a/evalml/demos/diabetes.py b/evalml/demos/diabetes.py index 9642b28840..a5b47dbbd5 100644 --- a/evalml/demos/diabetes.py +++ b/evalml/demos/diabetes.py @@ -9,7 +9,8 @@ def load_diabetes(): """Load diabetes dataset. Used for regression problem. - Returns: + Return: + ------ (pd.Dataframe, pd.Series): X and y """ filename = ( diff --git a/evalml/demos/fraud.py b/evalml/demos/fraud.py index bc56e72a3c..07771f16ff 100644 --- a/evalml/demos/fraud.py +++ b/evalml/demos/fraud.py @@ -9,10 +9,12 @@ def load_fraud(n_rows=None, verbose=True): The fraud dataset can be used for binary classification problems. Arguments: + --------- n_rows (int): Number of rows from the dataset to return verbose (bool): Whether to print information about features and labels - Returns: + Return: + ------ (pd.Dataframe, pd.Series): X and y """ fraud_data_path = ( diff --git a/evalml/demos/wine.py b/evalml/demos/wine.py index e9fee6dba3..5ce86196f2 100644 --- a/evalml/demos/wine.py +++ b/evalml/demos/wine.py @@ -6,9 +6,12 @@ def load_wine(): - """Load wine dataset. Multiclass problem. + """Load wine dataset. - Returns: + Multiclass problem. 
+ + Return: + ------ (pd.Dataframe, pd.Series): X and y """ filepath = ( diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py index 5bb3074868..677c3193d2 100644 --- a/evalml/preprocessing/data_splitters/time_series_split.py +++ b/evalml/preprocessing/data_splitters/time_series_split.py @@ -1,3 +1,4 @@ +"""Rolling Origin Cross Validation for time series problems.""" import numpy as np from sklearn.model_selection import TimeSeriesSplit as SkTimeSeriesSplit from sklearn.model_selection._split import BaseCrossValidator @@ -12,6 +13,7 @@ class TimeSeriesSplit(BaseCrossValidator): then set max_delay and gap to 0. Arguments: + --------- max_delay (int): Max delay value for feature engineering. Time series pipelines create delayed features from existing features. This process will introduce NaNs into the first max_delay number of rows. The splitter uses the last max_delay number of rows from the previous split as the first max_delay number @@ -44,11 +46,13 @@ def split(self, X, y=None, groups=None): at the same time. Arguments: + --------- X (pd.DataFrame, None): Features to split. y (pd.DataFrame, None): Target variable to split. - groups: Ignored but kept for compatibility with sklearn api. + groups: Ignored but kept for compatibility with sklearn API. - Returns: + Return: + ------ Iterator of (train, test) indices tuples. """ # Sklearn splitters always assume a valid X is passed but we need to support the From 28e67a2ba87edfceeb97b6ad9eb8285afa8d5a83 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 27 Aug 2021 02:03:46 -0400 Subject: [PATCH 04/62] fix some data checks --- Makefile | 3 +-- .../data_checks/class_imbalance_data_check.py | 12 +++++++-- evalml/data_checks/data_check_action.py | 10 ++++++-- evalml/data_checks/data_checks.py | 25 +++++++++++-------- evalml/data_checks/default_data_checks.py | 3 +++ evalml/data_checks/highly_null_data_check.py | 12 ++++++--- evalml/data_checks/id_columns_data_check.py | 12 ++++++--- .../natural_language_nan_data_check.py | 8 ++++-- evalml/data_checks/no_variance_data_check.py | 12 ++++++--- evalml/data_checks/sparsity_data_check.py | 17 +++++++++---- evalml/data_checks/utils.py | 13 ++++++---- evalml/demos/churn.py | 1 + evalml/demos/wine.py | 4 +-- 13 files changed, 91 insertions(+), 41 deletions(-) diff --git a/Makefile b/Makefile index f9ed79c1cd..82c0957ba0 100644 --- a/Makefile +++ b/Makefile @@ -9,12 +9,11 @@ clean: .PHONY: lint lint: flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions - pydocstyle evalml --ignore=D107 + pydocstyle evalml --ignore=D107,D203,D212 black evalml -t py39 --check .PHONY: lint-fix lint-fix: - docformatter --in-place . -r --wrap-summaries 0 black -t py39 evalml isort evalml python docs/notebook_version_standardizer.py standardize diff --git a/evalml/data_checks/class_imbalance_data_check.py b/evalml/data_checks/class_imbalance_data_check.py index 87dd83a8e3..9913e31de6 100644 --- a/evalml/data_checks/class_imbalance_data_check.py +++ b/evalml/data_checks/class_imbalance_data_check.py @@ -1,3 +1,7 @@ +"""Data check that checks if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds. + +Use for classification problems. 
+"""
 from evalml.data_checks import (
     DataCheck,
     DataCheckError,
@@ -11,6 +15,7 @@ class ClassImbalanceDataCheck(DataCheck):
     """Check if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds. Use for classification problems.
 
     Arguments:
+    ---------
     threshold (float): The minimum threshold allowed for class imbalance before a warning is raised.
         This threshold is calculated by comparing the number of samples in each class to the sum of samples in that class and the majority class.
         For example, a multiclass case with [900, 900, 100] samples per classes 0, 1, and 2, respectively,
@@ -42,17 +47,20 @@ def __init__(self, threshold=0.1, min_samples=100, num_cv_folds=3):
         self.cv_folds = num_cv_folds * 2
 
     def validate(self, X, y):
-        """Checks if any target labels are imbalanced beyond a threshold for binary and multiclass problems Ignores NaN values in target labels if they appear.
+        """Check if any target labels are imbalanced beyond a threshold for binary and multiclass problems. Ignores NaN values in target labels if they appear.
 
         Arguments:
+        ---------
         X (pd.DataFrame, np.ndarray): Features. Ignored.
         y (pd.Series, np.ndarray): Target labels to check for imbalanced data.
 
-        Returns:
+        Return:
+        ------
         dict: Dictionary with DataCheckWarnings if imbalance in classes is less than the threshold,
             and DataCheckErrors if the number of values for each target is below 2 * num_cv_folds.
 
         Example:
+        -------
         >>> import pandas as pd
         >>> X = pd.DataFrame()
         >>> y = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
diff --git a/evalml/data_checks/data_check_action.py b/evalml/data_checks/data_check_action.py
index f7fa85024e..4e34f19ad0 100644
--- a/evalml/data_checks/data_check_action.py
+++ b/evalml/data_checks/data_check_action.py
@@ -1,9 +1,14 @@
+"""Recommended action returned by a DataCheck."""
+
+
 class DataCheckAction:
-    """A recommended action returned by a DataCheck.
+    """Recommended action returned by a DataCheck.
 
     Arguments:
+    ---------
     action_code (DataCheckActionCode): Action code associated with the action.
     metadata (dict, optional): Additional useful information associated with the action. Defaults to None.
+
     """
 
     def __init__(self, action_code, metadata=None):
@@ -11,7 +16,7 @@ def __init__(self, action_code, metadata=None):
         self.metadata = metadata or {}
 
     def __eq__(self, other):
-        """Checks for equality.
+        """Check for equality.
 
         Two DataCheckAction objs are considered equivalent if all of their attributes are equivalent. 
@@ -19,5 +24,6 @@ def __eq__(self, other): return self.action_code == other.action_code and self.metadata == other.metadata def to_dict(self): + """Return a dictionary form of the data check action.""" action_dict = {"code": self.action_code.name, "metadata": self.metadata} return action_dict diff --git a/evalml/data_checks/data_checks.py b/evalml/data_checks/data_checks.py index 7bb3887b28..c2cbe0c4fc 100644 --- a/evalml/data_checks/data_checks.py +++ b/evalml/data_checks/data_checks.py @@ -1,3 +1,4 @@ +"""A collection of data checks.""" import inspect from evalml.data_checks import DataCheck @@ -6,7 +7,7 @@ def _has_defaults_for_all_args(init): - """Tests whether the init method has defaults for all arguments.""" + """Test whether the init method has defaults for all arguments.""" signature = inspect.getfullargspec(init) n_default_args = 0 if not signature.defaults else len(signature.defaults) n_args = ( @@ -16,11 +17,17 @@ def _has_defaults_for_all_args(init): class DataChecks: - """A collection of data checks.""" + """A collection of data checks. + + Arguments: + --------- + data_checks (list (DataCheck)): List of DataCheck objects. + data_check_params (dict): Parameters for passed DataCheck objects. + """ @staticmethod def _validate_data_checks(data_check_classes, params): - """Inits a DataChecks instance from a list of DataCheck classes and corresponding params.""" + """Init a DataChecks instance from a list of DataCheck classes and corresponding params.""" if not isinstance(data_check_classes, list): raise ValueError( f"Parameter data_checks must be a list. Received {type(data_check_classes).__name__}." @@ -73,25 +80,21 @@ def _init_data_checks(data_check_classes, params): return data_check_instances def __init__(self, data_checks=None, data_check_params=None): - """A collection of data checks. - - Arguments: - data_checks (list (DataCheck)): List of DataCheck objects - data_check_params (dict): Parameters for passed DataCheck objects - """ data_check_params = data_check_params or dict() self._validate_data_checks(data_checks, data_check_params) data_check_instances = self._init_data_checks(data_checks, data_check_params) self.data_checks = data_check_instances def validate(self, X, y=None): - """Inspects and validates the input data against data checks and returns a list of warnings and errors if applicable. + """Inspect and validate the input data against data checks and returns a list of warnings and errors if applicable. Arguments: + --------- X (pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features] y (pd.Series, np.ndarray): The target data of length [n_samples] - Returns: + Return: + ------ dict: Dictionary containing DataCheckMessage objects """ messages = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/default_data_checks.py b/evalml/data_checks/default_data_checks.py index 27fa073e2d..2b4114708a 100644 --- a/evalml/data_checks/default_data_checks.py +++ b/evalml/data_checks/default_data_checks.py @@ -1,3 +1,4 @@ +"""A default set of data checks that can be used for a variety of datasets.""" from .class_imbalance_data_check import ClassImbalanceDataCheck from .data_checks import DataChecks from .datetime_format_data_check import DateTimeFormatDataCheck @@ -19,6 +20,7 @@ class DefaultDataChecks(DataChecks): """A collection of basic data checks that is used by AutoML by default. 
+
     Includes:
 
     - `HighlyNullDataCheck`
    - `IDColumnsDataCheck`
    - `TargetLeakageDataCheck`
    - `InvalidTargetDataCheck`
    - `NoVarianceDataCheck`
    - `ClassImbalanceDataCheck` (for classification problem types)
    - `TargetDistributionDataCheck` (for regression problem types)
    - `DateTimeNaNDataCheck`
    - `NaturalLanguageNaNDataCheck`
    - `DateTimeFormatDataCheck` (for time series problem types)
 
     Arguments:
+    ---------
         problem_type (str): The problem type that is being validated. Can be regression, binary, or multiclass.
         objective (str or ObjectiveBase): Name or instance of the objective class.
         n_splits (int): The number of splits as determined by the data splitter being used. Defaults to 3.
diff --git a/evalml/data_checks/highly_null_data_check.py b/evalml/data_checks/highly_null_data_check.py
index 334bbb7c56..c3880f4992 100644
--- a/evalml/data_checks/highly_null_data_check.py
+++ b/evalml/data_checks/highly_null_data_check.py
@@ -1,3 +1,5 @@
+"""Data check that checks if there are any highly-null columns and rows in the input."""
+
 from evalml.data_checks import (
     DataCheck,
     DataCheckAction,
@@ -9,9 +11,10 @@
 
 
 class HighlyNullDataCheck(DataCheck):
-    """Checks if there are any highly-null columns and rows in the input.
+    """Check if there are any highly-null columns and rows in the input.
 
     Arguments:
+    ---------
     pct_null_col_threshold(float): If the percentage of NaN values in an input feature exceeds this amount,
         that column will be considered highly-null. Defaults to 0.95.
     pct_null_row_threshold(float): If the percentage of NaN values in an input row exceeds this amount,
         that row will be considered highly-null. Defaults to 0.95.
     """
@@ -32,16 +35,19 @@ def __init__(self, pct_null_col_threshold=0.95, pct_null_row_threshold=0.95):
         self.pct_null_row_threshold = pct_null_row_threshold
 
     def validate(self, X, y=None):
-        """Checks if there are any highly-null columns or rows in the input.
+        """Check if there are any highly-null columns or rows in the input.
 
         Arguments:
+        ---------
         X (pd.DataFrame, np.ndarray): Features.
         y (pd.Series, np.ndarray): Ignored.
 
-        Returns:
+        Return:
+        ------
         dict: dict with a DataCheckWarning if there are any highly-null columns or rows.
 
         Example:
+        -------
         >>> import pandas as pd
         >>> class SeriesWrap():
         ...     def __init__(self, series):
diff --git a/evalml/data_checks/id_columns_data_check.py b/evalml/data_checks/id_columns_data_check.py
index 8330943c37..41b29cdb23 100644
--- a/evalml/data_checks/id_columns_data_check.py
+++ b/evalml/data_checks/id_columns_data_check.py
@@ -1,3 +1,4 @@
+"""Data check that checks if any of the features are likely to be ID columns."""
 from evalml.data_checks import (
     DataCheck,
     DataCheckAction,
@@ -12,6 +13,7 @@ class IDColumnsDataCheck(DataCheck):
     """Check if any of the features are likely to be ID columns.
 
     Arguments:
+    ---------
     id_threshold (float): The probability threshold to be considered an ID column. Defaults to 1.0.
     """
 
@@ -21,19 +23,23 @@ def __init__(self, id_threshold=1.0):
         self.id_threshold = id_threshold
 
     def validate(self, X, y=None):
-        """Check if any of the features are likely to be ID columns. Currently performs these simple checks:
+        """Check if any of the features are likely to be ID columns. Currently performs a number of simple checks.
 
         - column name is "id"
         - column name ends in "_id"
         - column contains all unique values (and is categorical / integer type)
 
         Arguments:
-            X (pd.DataFrame, np.ndarray): The input features to check
+        ---------
+            X (pd.DataFrame, np.ndarray): The input features to check.
+            y (pd.Series): The target. Defaults to None. Ignored.
 
-        Returns:
+        Return:
+        ------
             dict: A dictionary of features with column name or index and their probability of being ID columns
 
         Example:
+        -------
         >>> import pandas as pd
         >>> df = pd.DataFrame({
        ... 
'df_id': [0, 1, 2, 3, 4], diff --git a/evalml/data_checks/natural_language_nan_data_check.py b/evalml/data_checks/natural_language_nan_data_check.py index 5d2c4a9fa0..8d02475021 100644 --- a/evalml/data_checks/natural_language_nan_data_check.py +++ b/evalml/data_checks/natural_language_nan_data_check.py @@ -1,3 +1,4 @@ +"""Data check that checks each column in the input for natural language features and will issue an error if NaN values are present.""" from evalml.data_checks import DataCheck, DataCheckError, DataCheckMessageCode from evalml.utils.woodwork_utils import infer_feature_types @@ -8,16 +9,19 @@ class NaturalLanguageNaNDataCheck(DataCheck): """Checks each column in the input for natural language features and will issue an error if NaN values are present.""" def validate(self, X, y=None): - """Checks if any natural language columns contain NaN values. + """Check if any natural language columns contain NaN values. Arguments: + --------- X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. Defaults to None. - Returns: + Return: + ------ dict: dict with a DataCheckError if NaN values are present in natural language columns. Example: + ------- >>> import pandas as pd >>> import woodwork as ww >>> import numpy as np diff --git a/evalml/data_checks/no_variance_data_check.py b/evalml/data_checks/no_variance_data_check.py index 6a2d6e86b0..9221d091d2 100644 --- a/evalml/data_checks/no_variance_data_check.py +++ b/evalml/data_checks/no_variance_data_check.py @@ -1,3 +1,4 @@ +"""Data check that checks if the target or any of the features have no variance.""" from evalml.data_checks import ( DataCheck, DataCheckAction, @@ -16,6 +17,7 @@ class NoVarianceDataCheck(DataCheck): """Check if the target or any of the features have no variance. Arguments: + --------- count_nan_as_value (bool): If True, missing values will be counted as their own unique value. Additionally, if true, will return a DataCheckWarning instead of an error if the feature has mostly missing data and only one unique value. @@ -26,14 +28,16 @@ def __init__(self, count_nan_as_value=False): self._dropnan = not count_nan_as_value def _check_for_errors(self, column_name, count_unique, any_nulls): - """Checks if a column has no variance. + """Check if a column has no variance. Arguments: + --------- column_name (str): Name of the column we are checking. count_unique (float): Number of unique values in this column. any_nulls (bool): Whether this column has any missing data. - Returns: + Return: + ------ DataCheckError if the column has no variance or DataCheckWarning if the column has two unique values including NaN. """ message = f"{column_name} has {int(count_unique)} unique value." @@ -60,10 +64,12 @@ def validate(self, X, y): """Check if the target or any of the features have no variance (1 unique value). Arguments: + --------- X (pd.DataFrame, np.ndarray): The input features. y (pd.Series, np.ndarray): The target data. - Returns: + Return: + ------ dict: dict of warnings/errors corresponding to features or target with no variance. 
""" results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/sparsity_data_check.py b/evalml/data_checks/sparsity_data_check.py index ea792b3a49..078b78337d 100644 --- a/evalml/data_checks/sparsity_data_check.py +++ b/evalml/data_checks/sparsity_data_check.py @@ -1,3 +1,4 @@ +"""Data check that checks if there are any columns with sparsely populated values in the input.""" from evalml.data_checks import ( DataCheck, DataCheckAction, @@ -12,9 +13,10 @@ class SparsityDataCheck(DataCheck): - """Checks if there are any columns with sparsely populated values in the input. + """Check if there are any columns with sparsely populated values in the input. Arguments: + --------- problem_type (str or ProblemTypes): The specific problem type to data check for. 'multiclass' or 'time series multiclass' is the only accepted problem type. threshold (float): The threshold value, or percentage of each column's unique values, @@ -36,16 +38,19 @@ def __init__(self, problem_type, threshold, unique_count_threshold=10): raise ValueError("Unique count threshold must be positive integer.") def validate(self, X, y=None): - """Calculates what percentage of each column's unique values exceed the count threshold and compare that percentage to the sparsity threshold stored in the class instance. + """Calculate what percentage of each column's unique values exceed the count threshold and compare that percentage to the sparsity threshold stored in the class instance. Arguments: + --------- X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. - Returns: + Return: + ------ dict: dict with a DataCheckWarning if there are any sparse columns. Example: + ------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'sparse': [float(x) for x in range(100)], @@ -95,14 +100,16 @@ def validate(self, X, y=None): @staticmethod def sparsity_score(col, count_threshold=10): - """This function calculates a sparsity score for the given value counts by calculating the percentage of unique values that exceed the count_threshold. + """Calculate a sparsity score for the given value counts by calculating the percentage of unique values that exceed the count_threshold. Arguments: + --------- col (pd.Series): Feature values. count_threshold (int): The number of instances below which a value is considered sparse. Default is 10. - Returns: + Return: + ------ (float): Sparsity score, or the percentage of the unique values that exceed count_threshold. """ counts = col.value_counts() diff --git a/evalml/data_checks/utils.py b/evalml/data_checks/utils.py index fb2f4182ae..b338baac2c 100644 --- a/evalml/data_checks/utils.py +++ b/evalml/data_checks/utils.py @@ -1,11 +1,14 @@ +"""Utilities for data checks.""" from .data_checks import DataChecks class EmptyDataChecks(DataChecks): - def __init__(self, data_checks=None): - """An empty collection of data checks. + """An empty collection of data checks. + + Arguments: + --------- + data_checks (list (DataCheck)): Ignored. + """ - Arguments: - data_checks (list (DataCheck)): Ignored. - """ + def __init__(self, data_checks=None): self.data_checks = [] diff --git a/evalml/demos/churn.py b/evalml/demos/churn.py index f39975cc50..9756666c1f 100644 --- a/evalml/demos/churn.py +++ b/evalml/demos/churn.py @@ -7,6 +7,7 @@ def load_churn(n_rows=None, verbose=True): """Load churn dataset, which can be used for binary classification problems. 
Arguments: + --------- n_rows (int): Number of rows from the dataset to return verbose (bool): Whether to print information about features and labels diff --git a/evalml/demos/wine.py b/evalml/demos/wine.py index 5ce86196f2..896776ec05 100644 --- a/evalml/demos/wine.py +++ b/evalml/demos/wine.py @@ -6,9 +6,7 @@ def load_wine(): - """Load wine dataset. - - Multiclass problem. + """Load wine dataset. Multiclass problem. Return: ------ From 3752f8b9b286c9cb41841968c53650f892a38c01 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 27 Aug 2021 02:06:28 -0400 Subject: [PATCH 05/62] start to add lines --- evalml/data_checks/highly_null_data_check.py | 8 ++++++-- .../data_checks/natural_language_nan_data_check.py | 4 +++- evalml/data_checks/no_variance_data_check.py | 12 +++++++++--- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/evalml/data_checks/highly_null_data_check.py b/evalml/data_checks/highly_null_data_check.py index c3880f4992..6a1dc066d1 100644 --- a/evalml/data_checks/highly_null_data_check.py +++ b/evalml/data_checks/highly_null_data_check.py @@ -11,7 +11,8 @@ class HighlyNullDataCheck(DataCheck): - """Check if there are any highly-null columns and rows in the input. + """ + Check if there are any highly-null columns and rows in the input. Arguments: --------- @@ -19,6 +20,7 @@ class HighlyNullDataCheck(DataCheck): that column will be considered highly-null. Defaults to 0.95. pct_null_row_threshold(float): If the percentage of NaN values in an input row exceeds this amount, that row will be considered highly-null. Defaults to 0.95. + """ def __init__(self, pct_null_col_threshold=0.95, pct_null_row_threshold=0.95): @@ -35,7 +37,8 @@ def __init__(self, pct_null_col_threshold=0.95, pct_null_row_threshold=0.95): self.pct_null_row_threshold = pct_null_row_threshold def validate(self, X, y=None): - """Check if there are any highly-null columns or rows in the input. + """ + Check if there are any highly-null columns or rows in the input. Arguments: --------- @@ -78,6 +81,7 @@ def validate(self, X, y=None): ... "details": {"column": "lots_of_null", "pct_null_rows": 0.8}}], ... "actions": [{"code": "DROP_ROWS", "metadata": {"rows": [0, 1, 2, 3]}}, ... {"code": "DROP_COL", "metadata": {"column": "lots_of_null"}}]} + """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/natural_language_nan_data_check.py b/evalml/data_checks/natural_language_nan_data_check.py index 8d02475021..48ff277b27 100644 --- a/evalml/data_checks/natural_language_nan_data_check.py +++ b/evalml/data_checks/natural_language_nan_data_check.py @@ -9,7 +9,8 @@ class NaturalLanguageNaNDataCheck(DataCheck): """Checks each column in the input for natural language features and will issue an error if NaN values are present.""" def validate(self, X, y=None): - """Check if any natural language columns contain NaN values. + """ + Check if any natural language columns contain NaN values. Arguments: --------- @@ -39,6 +40,7 @@ def validate(self, X, y=None): ... message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, ... details={"columns": 'A'}).to_dict()] ... 
} + """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/no_variance_data_check.py b/evalml/data_checks/no_variance_data_check.py index 9221d091d2..08bab62578 100644 --- a/evalml/data_checks/no_variance_data_check.py +++ b/evalml/data_checks/no_variance_data_check.py @@ -14,7 +14,8 @@ class NoVarianceDataCheck(DataCheck): - """Check if the target or any of the features have no variance. + """ + Check if the target or any of the features have no variance. Arguments: --------- @@ -22,13 +23,15 @@ class NoVarianceDataCheck(DataCheck): Additionally, if true, will return a DataCheckWarning instead of an error if the feature has mostly missing data and only one unique value. Defaults to False. + """ def __init__(self, count_nan_as_value=False): self._dropnan = not count_nan_as_value def _check_for_errors(self, column_name, count_unique, any_nulls): - """Check if a column has no variance. + """ + Check if a column has no variance. Arguments: --------- @@ -39,6 +42,7 @@ def _check_for_errors(self, column_name, count_unique, any_nulls): Return: ------ DataCheckError if the column has no variance or DataCheckWarning if the column has two unique values including NaN. + """ message = f"{column_name} has {int(count_unique)} unique value." @@ -61,7 +65,8 @@ def _check_for_errors(self, column_name, count_unique, any_nulls): ) def validate(self, X, y): - """Check if the target or any of the features have no variance (1 unique value). + """ + Check if the target or any of the features have no variance (1 unique value). Arguments: --------- @@ -71,6 +76,7 @@ def validate(self, X, y): Return: ------ dict: dict of warnings/errors corresponding to features or target with no variance. + """ results = {"warnings": [], "errors": [], "actions": []} From 4ba3d25b52c3e15c0f4bf532a541fddbc647f181 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 27 Aug 2021 11:40:03 -0400 Subject: [PATCH 06/62] dashes under sections --- evalml/automl/engine/cf_engine.py | 6 +++--- evalml/automl/engine/dask_engine.py | 8 ++++---- evalml/automl/utils.py | 3 ++- .../data_checks/class_imbalance_data_check.py | 15 +++++++++----- evalml/data_checks/data_check_action.py | 10 ++++++---- evalml/data_checks/data_check_message.py | 16 ++++++++++----- evalml/data_checks/data_checks.py | 12 +++++++---- evalml/data_checks/datetime_nan_data_check.py | 2 ++ evalml/data_checks/default_data_checks.py | 4 +++- evalml/data_checks/highly_null_data_check.py | 6 +++--- evalml/data_checks/id_columns_data_check.py | 12 +++++++---- .../data_checks/invalid_targets_data_check.py | 4 +++- .../multicollinearity_data_check.py | 4 +++- .../natural_language_nan_data_check.py | 4 ++-- evalml/data_checks/no_variance_data_check.py | 8 ++++---- evalml/data_checks/outliers_data_check.py | 8 ++++++-- evalml/data_checks/sparsity_data_check.py | 20 ++++++++++++------- .../target_distribution_data_check.py | 12 ++++++++--- .../data_checks/target_leakage_data_check.py | 16 ++++++++++++--- evalml/data_checks/uniqueness_data_check.py | 8 ++++++-- evalml/data_checks/utils.py | 4 +++- evalml/demos/breast_cancer.py | 2 +- evalml/demos/churn.py | 2 +- evalml/demos/diabetes.py | 2 +- evalml/demos/fraud.py | 2 +- evalml/demos/wine.py | 2 +- .../data_splitters/time_series_split.py | 2 +- 27 files changed, 128 insertions(+), 66 deletions(-) diff --git a/evalml/automl/engine/cf_engine.py b/evalml/automl/engine/cf_engine.py index 5060725fdd..bb84fe9204 100644 --- a/evalml/automl/engine/cf_engine.py +++ b/evalml/automl/engine/cf_engine.py @@ -97,7 
+97,7 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputat pipeline (pipeline.PipelineBase): pipeline to evaluate X (pd.DataFrame): input data for modeling y (pd.Series): target data for modeling - Return: + Returns CFComputation: an object wrapping a reference to a future-like computation occurring in the resource pool """ @@ -120,7 +120,7 @@ def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputatio pipeline (pipeline.PipelineBase): pipeline to train X (pd.DataFrame): input data for modeling y (pd.Series): target data for modeling - Return: + Returns CFComputation: an object wrapping a reference to a future-like computation occurring in the resource pool """ @@ -139,7 +139,7 @@ def submit_scoring_job( pipeline (pipeline.PipelineBase): pipeline to train X (pd.DataFrame): input data for modeling y (pd.Series): target data for modeling - Return: + Returns CFComputation: an object wrapping a reference to a future-like computation occurring in the resource pool """ diff --git a/evalml/automl/engine/dask_engine.py b/evalml/automl/engine/dask_engine.py index fa5ec84ab7..6a61b6fd16 100644 --- a/evalml/automl/engine/dask_engine.py +++ b/evalml/automl/engine/dask_engine.py @@ -69,7 +69,7 @@ def send_data_to_cluster(self, X, y): Arguments: X (pd.DataFrame): input data for modeling y (pd.Series): target data for modeling - Return: + Returns dask.Future: the modeling data """ data_hash = joblib.hash(X), joblib.hash(y) @@ -90,7 +90,7 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputat pipeline (pipeline.PipelineBase): pipeline to evaluate X (pd.DataFrame): input data for modeling y (pd.Series): target data for modeling - Return: + Returns DaskComputation: an object wrapping a reference to a future-like computation occurring in the dask cluster """ @@ -114,7 +114,7 @@ def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputatio pipeline (pipeline.PipelineBase): pipeline to train X (pd.DataFrame): input data for modeling y (pd.Series): target data for modeling - Return: + Returns DaskComputation: an object wrapping a reference to a future-like computation occurring in the dask cluster """ @@ -134,7 +134,7 @@ def submit_scoring_job( pipeline (pipeline.PipelineBase): pipeline to train X (pd.DataFrame): input data for modeling y (pd.Series): target data for modeling - Return: + Returns DaskComputation: an object wrapping a reference to a future-like computation occurring in the dask cluster """ diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 357fd32816..2c5b74adae 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -35,7 +35,8 @@ def get_default_primary_search_objective(problem_type): Arguments: problem_type (str or ProblemType): problem type of interest. - Returns: + Returns + ------- ObjectiveBase: primary objective instance for the problem type. """ problem_type = handle_problem_types(problem_type) diff --git a/evalml/data_checks/class_imbalance_data_check.py b/evalml/data_checks/class_imbalance_data_check.py index 9913e31de6..bcc2bed150 100644 --- a/evalml/data_checks/class_imbalance_data_check.py +++ b/evalml/data_checks/class_imbalance_data_check.py @@ -1,4 +1,5 @@ -"""Data check that checks if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds. 
+""" +Data check that checks if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds. Use for classification problems. """ @@ -12,7 +13,8 @@ class ClassImbalanceDataCheck(DataCheck): - """Check if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds. Use for classification problems. + """ + Check if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds. Use for classification problems. Arguments: --------- @@ -23,6 +25,7 @@ class ClassImbalanceDataCheck(DataCheck): min_samples (int): The minimum number of samples per accepted class. If the minority class is both below the threshold and min_samples, then we consider this severely imbalanced. Must be greater than 0. Defaults to 100. num_cv_folds (int): The number of cross-validation folds. Must be positive. Choose 0 to ignore this warning. Defaults to 3. + """ def __init__(self, threshold=0.1, min_samples=100, num_cv_folds=3): @@ -47,15 +50,16 @@ def __init__(self, threshold=0.1, min_samples=100, num_cv_folds=3): self.cv_folds = num_cv_folds * 2 def validate(self, X, y): - """Check if any target labels are imbalanced beyond a threshold for binary and multiclass problems Ignores NaN values in target labels if they appear. + """ + Check if any target labels are imbalanced beyond a threshold for binary and multiclass problems Ignores NaN values in target labels if they appear. Arguments: --------- X (pd.DataFrame, np.ndarray): Features. Ignored. y (pd.Series, np.ndarray): Target labels to check for imbalanced data. - Return: - ------ + Returns + ------- dict: Dictionary with DataCheckWarnings if imbalance in classes is less than the threshold, and DataCheckErrors if the number of values for each target is below 2 * num_cv_folds. @@ -81,6 +85,7 @@ def validate(self, X, y): ... "code": "CLASS_IMBALANCE_SEVERE", ... "details": {"target_values": [0]}}], ... "actions": []} + """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/data_check_action.py b/evalml/data_checks/data_check_action.py index 4e34f19ad0..2f8443bed4 100644 --- a/evalml/data_checks/data_check_action.py +++ b/evalml/data_checks/data_check_action.py @@ -2,7 +2,8 @@ class DataCheckAction: - """Recommended action returned by a DataCheck. + """ + Recommended action returned by a DataCheck. Arguments: --------- @@ -16,10 +17,11 @@ def __init__(self, action_code, metadata=None): self.metadata = metadata or {} def __eq__(self, other): - """Check for equality. + """ + Check for equality. + + Two DataCheckAction objs are considered equivalent if all of their attributes are equivalent. - Two DataCheckAction objs are considered equivalent if all of - their attributes are equivalent. """ return self.action_code == other.action_code and self.metadata == other.metadata diff --git a/evalml/data_checks/data_check_message.py b/evalml/data_checks/data_check_message.py index 59243c7fb9..9235f0cc7f 100644 --- a/evalml/data_checks/data_check_message.py +++ b/evalml/data_checks/data_check_message.py @@ -1,14 +1,18 @@ +"""Messages returned by a DataCheck, tagged by name.""" from .data_check_message_type import DataCheckMessageType class DataCheckMessage: - """Base class for a message returned by a DataCheck, tagged by name. + """ + Base class for a message returned by a DataCheck, tagged by name. 
Arguments: - message (str): Message string - data_check_name (str): Name of data check + --------- + message (str): Message string. + data_check_name (str): Name of data check. message_code (DataCheckMessageCode): Message code associated with message. Defaults to None. details (dict): Additional useful information associated with the message. Defaults to None. + """ message_type = None @@ -20,11 +24,12 @@ def __init__(self, message, data_check_name, message_code=None, details=None): self.details = details def __str__(self): - """String representation of data check message, equivalent to self.message attribute.""" + """Return a string representation of data check message, equivalent to self.message attribute.""" return self.message def __eq__(self, other): - """Checks for equality. + """ + Check for equality. Two DataCheckMessage objs are considered equivalent if all of their attributes are equivalent. @@ -38,6 +43,7 @@ def __eq__(self, other): ) def to_dict(self): + """Return a dictionary form of the data check message.""" message_dict = { "message": self.message, "data_check_name": self.data_check_name, diff --git a/evalml/data_checks/data_checks.py b/evalml/data_checks/data_checks.py index c2cbe0c4fc..b4bd41d69c 100644 --- a/evalml/data_checks/data_checks.py +++ b/evalml/data_checks/data_checks.py @@ -17,12 +17,14 @@ def _has_defaults_for_all_args(init): class DataChecks: - """A collection of data checks. + """ + A collection of data checks. Arguments: --------- data_checks (list (DataCheck)): List of DataCheck objects. data_check_params (dict): Parameters for passed DataCheck objects. + """ @staticmethod @@ -86,16 +88,18 @@ def __init__(self, data_checks=None, data_check_params=None): self.data_checks = data_check_instances def validate(self, X, y=None): - """Inspect and validate the input data against data checks and returns a list of warnings and errors if applicable. + """ + Inspect and validate the input data against data checks and returns a list of warnings and errors if applicable. Arguments: --------- X (pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features] y (pd.Series, np.ndarray): The target data of length [n_samples] - Return: - ------ + Returns + ------- dict: Dictionary containing DataCheckMessage objects + """ messages = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) diff --git a/evalml/data_checks/datetime_nan_data_check.py b/evalml/data_checks/datetime_nan_data_check.py index d3e6305030..6ae04f207d 100644 --- a/evalml/data_checks/datetime_nan_data_check.py +++ b/evalml/data_checks/datetime_nan_data_check.py @@ -15,6 +15,7 @@ def validate(self, X, y=None): y (pd.Series, np.ndarray): Ignored. Defaults to None. Returns: + ------- dict: dict with a DataCheckError if NaN values are present in datetime columns. Example: @@ -32,6 +33,7 @@ def validate(self, X, y=None): ... data_check_name=DateTimeNaNDataCheck.name, ... message_code=DataCheckMessageCode.DATETIME_HAS_NAN, ... details={"columns": 'index'}).to_dict()]} + """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/default_data_checks.py b/evalml/data_checks/default_data_checks.py index 2b4114708a..228f34ac53 100644 --- a/evalml/data_checks/default_data_checks.py +++ b/evalml/data_checks/default_data_checks.py @@ -19,7 +19,8 @@ class DefaultDataChecks(DataChecks): - """A collection of basic data checks that is used by AutoML by default. + """ + A collection of basic data checks that is used by AutoML by default. 
Includes:
@@ -42,6 +43,7 @@ class DefaultDataChecks(DataChecks):
         n_splits (int): The number of splits as determined by the data splitter being used. Defaults to 3.
         datetime_column (str): The name of the column containing datetime information to be used for time series problems.
             Defaults to "index" indicating that the datetime information is in the index of X or y.
+
     """

     _DEFAULT_DATA_CHECK_CLASSES = [
diff --git a/evalml/data_checks/highly_null_data_check.py b/evalml/data_checks/highly_null_data_check.py
index 6a1dc066d1..480e1ea13c 100644
--- a/evalml/data_checks/highly_null_data_check.py
+++ b/evalml/data_checks/highly_null_data_check.py
@@ -45,8 +45,8 @@ def validate(self, X, y=None):
             X (pd.DataFrame, np.ndarray): Features.
             y (pd.Series, np.ndarray): Ignored.

-        Return:
-        ------
+        Returns
+        -------
             dict: dict with a DataCheckWarning if there are any highly-null columns or rows.

         Example:
@@ -81,7 +81,7 @@ def validate(self, X, y=None):
             ...                          "details": {"column": "lots_of_null", "pct_null_rows": 0.8}}],
             ...     "actions": [{"code": "DROP_ROWS", "metadata": {"rows": [0, 1, 2, 3]}},
             ...                 {"code": "DROP_COL", "metadata": {"column": "lots_of_null"}}]}
-
+
         """
         results = {"warnings": [], "errors": [], "actions": []}

diff --git a/evalml/data_checks/id_columns_data_check.py b/evalml/data_checks/id_columns_data_check.py
index 41b29cdb23..0b8e1776c5 100644
--- a/evalml/data_checks/id_columns_data_check.py
+++ b/evalml/data_checks/id_columns_data_check.py
@@ -10,11 +10,13 @@


 class IDColumnsDataCheck(DataCheck):
-    """Check if any of the features are likely to be ID columns.
+    """
+    Check if any of the features are likely to be ID columns.

     Arguments:
     ---------
         id_threshold (float): The probability threshold to be considered an ID column. Defaults to 1.0.
+
     """

     def __init__(self, id_threshold=1.0):
@@ -23,7 +25,8 @@ def __init__(self, id_threshold=1.0):
         self.id_threshold = id_threshold

     def validate(self, X, y=None):
-        """Check if any of the features are likely to be ID columns. Currently performs a number of simple checks.
+        """
+        Check if any of the features are likely to be ID columns. Currently performs a number of simple checks.

         - column name is "id"
         - column name ends in "_id"
         - column contains all unique values (and is categorical / integer type)

         Arguments:
         ---------
             X (pd.DataFrame, np.ndarray): The input features to check.
             y (pd.Series): The target. Defaults to None. Ignored.

-        Return:
-        ------
+        Returns
+        -------
             dict: A dictionary of features with column name or index and their probability of being ID columns

         Example:
@@ -56,6 +59,7 @@ def validate(self, X, y=None):
             ...                   "details": {"column": "df_id"}}],
             ...     "actions": [{"code": "DROP_COL",
             ...                  "metadata": {"column": "df_id"}}]}
+
         """
         results = {"warnings": [], "errors": [], "actions": []}

diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py
index 47a706b631..d41388e603 100644
--- a/evalml/data_checks/invalid_targets_data_check.py
+++ b/evalml/data_checks/invalid_targets_data_check.py
@@ -46,10 +46,12 @@ def validate(self, X, y):
         """Checks if the target data contains missing or invalid values.

         Arguments:
+        ---------
             X (pd.DataFrame, np.ndarray): Features. Ignored.
             y (pd.Series, np.ndarray): Target data to check for invalid values.

-        Returns:
+        Returns
+        -------
             dict (DataCheckError): List with DataCheckErrors if any invalid values are found in the target data.
Example:
diff --git a/evalml/data_checks/multicollinearity_data_check.py b/evalml/data_checks/multicollinearity_data_check.py
index 5e80e8cc61..ce86d59155 100644
--- a/evalml/data_checks/multicollinearity_data_check.py
+++ b/evalml/data_checks/multicollinearity_data_check.py
@@ -22,10 +22,12 @@ def validate(self, X, y=None):
         """Check if any set of features are likely to be multicollinear.

         Arguments:
+        ---------
             X (pd.DataFrame): The input features to check.
             y (pd.Series): The target. Ignored.

-        Returns:
+        Returns
+        -------
             dict: dict with a DataCheckWarning if there are any potentially multicollinear columns.

         Example:
diff --git a/evalml/data_checks/natural_language_nan_data_check.py b/evalml/data_checks/natural_language_nan_data_check.py
index 48ff277b27..1603f5ced0 100644
--- a/evalml/data_checks/natural_language_nan_data_check.py
+++ b/evalml/data_checks/natural_language_nan_data_check.py
@@ -17,8 +17,8 @@ def validate(self, X, y=None):
             X (pd.DataFrame, np.ndarray): Features.
             y (pd.Series, np.ndarray): Ignored. Defaults to None.

-        Return:
-        ------
+        Returns
+        -------
             dict: dict with a DataCheckError if NaN values are present in natural language columns.

         Example:
diff --git a/evalml/data_checks/no_variance_data_check.py b/evalml/data_checks/no_variance_data_check.py
index 08bab62578..c9d5ee93cf 100644
--- a/evalml/data_checks/no_variance_data_check.py
+++ b/evalml/data_checks/no_variance_data_check.py
@@ -39,8 +39,8 @@ def _check_for_errors(self, column_name, count_unique, any_nulls):
             count_unique (float): Number of unique values in this column.
             any_nulls (bool): Whether this column has any missing data.

-        Return:
-        ------
+        Returns
+        -------
             DataCheckError if the column has no variance or DataCheckWarning if the column has two unique values including NaN.

         """
@@ -73,8 +73,8 @@ def validate(self, X, y):
             X (pd.DataFrame, np.ndarray): The input features.
             y (pd.Series, np.ndarray): The target data.

-        Return:
-        ------
+        Returns
+        -------
             dict: dict of warnings/errors corresponding to features or target with no variance.

         """
diff --git a/evalml/data_checks/outliers_data_check.py b/evalml/data_checks/outliers_data_check.py
index 92ec257813..6fb78c4ba6 100644
--- a/evalml/data_checks/outliers_data_check.py
+++ b/evalml/data_checks/outliers_data_check.py
@@ -10,7 +10,8 @@


 class OutliersDataCheck(DataCheck):
-    """Checks if there are any outliers in input data by using IQR to determine score anomalies.
+    """
+    Checks if there are any outliers in input data by using IQR to determine score anomalies.

     Columns with score anomalies are considered to contain outliers.
     """
@@ -19,13 +20,16 @@ def validate(self, X, y=None):
        """Checks if there are any outliers in a dataframe by using IQR to determine column anomalies. Columns with anomalies are considered to contain outliers.

         Arguments:
+        ---------
             X (pd.DataFrame, np.ndarray): Features
             y (pd.Series, np.ndarray): Ignored.

-        Returns:
+        Return:
+        ------
             dict: A dictionary with warnings if any columns have outliers.

         Example:
+        -------
             >>> import pandas as pd
             >>> df = pd.DataFrame({
             ...     'x': [1, 2, 3, 4, 5],
diff --git a/evalml/data_checks/sparsity_data_check.py b/evalml/data_checks/sparsity_data_check.py
index 078b78337d..b2f51e6555 100644
--- a/evalml/data_checks/sparsity_data_check.py
+++ b/evalml/data_checks/sparsity_data_check.py
@@ -13,7 +13,8 @@


 class SparsityDataCheck(DataCheck):
-    """Check if there are any columns with sparsely populated values in the input.
+    """
+    Check if there are any columns with sparsely populated values in the input.
Arguments:
     ---------
         problem_type (str or ProblemTypes): The specific problem type to data check for.
             'multiclass' or 'time series multiclass' is the only accepted problem type.
         threshold (float): The threshold value, or percentage of each column's unique values,
@@ -24,6 +25,7 @@ class SparsityDataCheck(DataCheck):
         unique_count_threshold (int): The minimum number of times a unique value has to be present in a
             column to not be considered "sparse." Defaults to 10.
+
     """

     def __init__(self, problem_type, threshold, unique_count_threshold=10):
@@ -38,15 +40,16 @@ def __init__(self, problem_type, threshold, unique_count_threshold=10):
             raise ValueError("Unique count threshold must be positive integer.")

     def validate(self, X, y=None):
-        """Calculate what percentage of each column's unique values exceed the count threshold and compare that percentage to the sparsity threshold stored in the class instance.
+        """
+        Calculate what percentage of each column's unique values exceed the count threshold and compare that percentage to the sparsity threshold stored in the class instance.

         Arguments:
         ---------
             X (pd.DataFrame, np.ndarray): Features.
             y (pd.Series, np.ndarray): Ignored.

-        Return:
-        ------
+        Returns
+        -------
             dict: dict with a DataCheckWarning if there are any sparse columns.

         Example:
@@ -66,6 +69,7 @@ def validate(self, X, y=None):
             ...                   "details": {"column": "sparse", 'sparsity_score': 0.0}}],
             ...     "actions": [{"code": "DROP_COL",
             ...                  "metadata": {"column": "sparse"}}]}
+
         """
         results = {"warnings": [], "errors": [], "actions": []}

@@ -100,7 +104,8 @@ def validate(self, X, y=None):

     @staticmethod
     def sparsity_score(col, count_threshold=10):
-        """Calculate a sparsity score for the given value counts by calculating the percentage of unique values that exceed the count_threshold.
+        """
+        Calculate a sparsity score for the given value counts by calculating the percentage of unique values that exceed the count_threshold.

         Arguments:
         ---------
             col (pd.Series): Feature values.
             count_threshold (int): The number of instances below which a value is considered sparse.
                 Default is 10.

-        Return:
-        ------
+        Returns
+        -------
             (float): Sparsity score, or the percentage of the unique values that exceed count_threshold.
+
         """
         counts = col.value_counts()
         score = sum(counts > count_threshold) / counts.size
diff --git a/evalml/data_checks/target_distribution_data_check.py b/evalml/data_checks/target_distribution_data_check.py
index aca00f324d..d0ab384ab3 100644
--- a/evalml/data_checks/target_distribution_data_check.py
+++ b/evalml/data_checks/target_distribution_data_check.py
@@ -1,3 +1,4 @@
+"""Data check that checks if the target data contains certain distributions that may need to be transformed prior to training to improve model performance."""
 import numpy as np
 import woodwork as ww
 from scipy.stats import shapiro
@@ -14,19 +15,23 @@


 class TargetDistributionDataCheck(DataCheck):
-    """Checks if the target data contains certain distributions that may need to be transformed prior training to improve model performance."""
+    """Check if the target data contains certain distributions that may need to be transformed prior to training to improve model performance."""

     def validate(self, X, y):
-        """Checks if the target data has a certain distribution.
+        """
+        Check if the target data has a certain distribution.

         Arguments:
+        ---------
             X (pd.DataFrame, np.ndarray): Features. Ignored.
             y (pd.Series, np.ndarray): Target data to check for underlying distributions.

-        Returns:
+        Returns
+        -------
             dict (DataCheckError): List with DataCheckErrors if certain distributions are found in the target data. 
Example: + ------- >>> from scipy.stats import lognorm >>> X = None >>> y = [0.946, 0.972, 1.154, 0.954, 0.969, 1.222, 1.038, 0.999, 0.973, 0.897] @@ -39,6 +44,7 @@ def validate(self, X, y): ... "code": "TARGET_LOGNORMAL_DISTRIBUTION", ... "details": {"shapiro-statistic/pvalue": '0.84/0.045'}}], ... "actions": [{'code': 'TRANSFORM_TARGET', 'metadata': {'column': None, 'transformation_strategy': 'lognormal', 'is_target': True}}]} + """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/target_leakage_data_check.py b/evalml/data_checks/target_leakage_data_check.py index 48202d19e4..24cac97279 100644 --- a/evalml/data_checks/target_leakage_data_check.py +++ b/evalml/data_checks/target_leakage_data_check.py @@ -1,3 +1,5 @@ +"""Data check that checks if any of the features are highly correlated with the target by using mutual information or Pearson correlation.""" + import pandas as pd from evalml.data_checks import ( @@ -14,15 +16,18 @@ class TargetLeakageDataCheck(DataCheck): - """Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation. + """ + Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation. If `method='mutual'`, this data check uses mutual information and supports all target and feature types. Otherwise, if `method='pearson'`, it uses Pearson correlation and only supports binary with numeric and boolean dtypes. Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. Arguments: + --------- pct_corr_threshold (float): The correlation threshold to be considered leakage. Defaults to 0.95. method (string): The method to determine correlation. Use 'mutual' for mutual information, otherwise 'pearson' for Pearson correlation. Defaults to 'mutual'. + """ def __init__(self, pct_corr_threshold=0.95, method="mutual"): @@ -65,19 +70,23 @@ def _calculate_mutual_information(self, X, y): return highly_corr_cols def validate(self, X, y): - """Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation. + """ + Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation. If `method='mutual'`, supports all target and feature types. Otherwise, if `method='pearson'` only supports binary with numeric and boolean dtypes. Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. Arguments: + --------- X (pd.DataFrame, np.ndarray): The input features to check y (pd.Series, np.ndarray): The target data - Returns: + Returns + ------- dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected. Example: + ------- >>> import pandas as pd >>> X = pd.DataFrame({ ... 'leak': [10, 42, 31, 51, 61], @@ -95,6 +104,7 @@ def validate(self, X, y): ... "errors": [], ... "actions": [{"code": "DROP_COL", ... "metadata": {"column": "leak"}}]} + """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/uniqueness_data_check.py b/evalml/data_checks/uniqueness_data_check.py index 304f937762..87169ff7bd 100644 --- a/evalml/data_checks/uniqueness_data_check.py +++ b/evalml/data_checks/uniqueness_data_check.py @@ -124,14 +124,18 @@ def validate(self, X, y=None): @staticmethod def uniqueness_score(col): - """This function calculates a uniqueness score for the provided field. 
NaN values are not considered as unique values in the calculation. + """ + This function calculates a uniqueness score for the provided field. NaN values are not considered as unique values in the calculation. Based on the Herfindahl–Hirschman Index. Arguments: + --------- col (pd.Series): Feature values. - Returns: + Returns + ------- (float): Uniqueness score. + """ norm_counts = col.value_counts() / col.value_counts().sum() square_counts = norm_counts ** 2 diff --git a/evalml/data_checks/utils.py b/evalml/data_checks/utils.py index b338baac2c..bebb47d5f3 100644 --- a/evalml/data_checks/utils.py +++ b/evalml/data_checks/utils.py @@ -3,11 +3,13 @@ class EmptyDataChecks(DataChecks): - """An empty collection of data checks. + """ + An empty collection of data checks. Arguments: --------- data_checks (list (DataCheck)): Ignored. + """ def __init__(self, data_checks=None): diff --git a/evalml/demos/breast_cancer.py b/evalml/demos/breast_cancer.py index 964af99c43..47aa014701 100644 --- a/evalml/demos/breast_cancer.py +++ b/evalml/demos/breast_cancer.py @@ -8,7 +8,7 @@ def load_breast_cancer(): """Load breast cancer dataset. Binary classification problem. - Return: + Returns ------ (pd.Dataframe, pd.Series): X and y """ diff --git a/evalml/demos/churn.py b/evalml/demos/churn.py index 9756666c1f..0db659045f 100644 --- a/evalml/demos/churn.py +++ b/evalml/demos/churn.py @@ -11,7 +11,7 @@ def load_churn(n_rows=None, verbose=True): n_rows (int): Number of rows from the dataset to return verbose (bool): Whether to print information about features and labels - Return: + Returns ------ (pd.Dataframe, pd.Series): X and y """ diff --git a/evalml/demos/diabetes.py b/evalml/demos/diabetes.py index a5b47dbbd5..f1eabcc970 100644 --- a/evalml/demos/diabetes.py +++ b/evalml/demos/diabetes.py @@ -9,7 +9,7 @@ def load_diabetes(): """Load diabetes dataset. Used for regression problem. - Return: + Returns ------ (pd.Dataframe, pd.Series): X and y """ diff --git a/evalml/demos/fraud.py b/evalml/demos/fraud.py index 07771f16ff..0571624b02 100644 --- a/evalml/demos/fraud.py +++ b/evalml/demos/fraud.py @@ -13,7 +13,7 @@ def load_fraud(n_rows=None, verbose=True): n_rows (int): Number of rows from the dataset to return verbose (bool): Whether to print information about features and labels - Return: + Returns ------ (pd.Dataframe, pd.Series): X and y """ diff --git a/evalml/demos/wine.py b/evalml/demos/wine.py index 896776ec05..b06f1a5783 100644 --- a/evalml/demos/wine.py +++ b/evalml/demos/wine.py @@ -8,7 +8,7 @@ def load_wine(): """Load wine dataset. Multiclass problem. - Return: + Returns ------ (pd.Dataframe, pd.Series): X and y """ diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py index 677c3193d2..9fbcb25105 100644 --- a/evalml/preprocessing/data_splitters/time_series_split.py +++ b/evalml/preprocessing/data_splitters/time_series_split.py @@ -51,7 +51,7 @@ def split(self, X, y=None, groups=None): y (pd.DataFrame, None): Target variable to split. groups: Ignored but kept for compatibility with sklearn API. - Return: + Returns ------ Iterator of (train, test) indices tuples. 
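A minimal usage sketch, assuming the TimeSeriesSplit API described above (n_splits is taken to be a constructor argument):

    import pandas as pd
    from evalml.preprocessing.data_splitters import TimeSeriesSplit

    X = pd.DataFrame({"feature": range(12)})
    y = pd.Series(range(12))
    for train_idx, test_idx in TimeSeriesSplit(n_splits=3).split(X, y):
        # every validation window starts strictly after its training window ends
        print(train_idx, test_idx)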
""" From 0cec01d68f3631828d81caafd6d3a6d6fa2d80ba Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 27 Aug 2021 12:13:04 -0400 Subject: [PATCH 07/62] sphinx dont tree warnings as errors, need to remove later but using to test changes in doc api ref --- docs/Makefile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index be36fab7e0..a8d23876a0 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -2,7 +2,6 @@ # # You can set these variables from the command line. -SPHINXOPTS = -W SPHINXBUILD = sphinx-build SOURCEDIR = source GENDIR = source/generated @@ -18,18 +17,18 @@ clean: .PHONY: html html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html $(SPHINXOPTS) -j 'auto' + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html -j 'auto' @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." # Put it first so that "make" without argument is like "make help". help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(O) From 080184009d9928e54f0d5f12e57d148e3b35dc3f Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 27 Aug 2021 15:25:02 -0400 Subject: [PATCH 08/62] try update make bat --- docs/make.bat | 5 +- evalml/data_checks/datetime_nan_data_check.py | 3 +- .../multicollinearity_data_check.py | 2 + .../balanced_classification_sampler.py | 31 ++++++++---- .../data_splitters/sampler_base.py | 12 ++++- .../data_splitters/time_series_split.py | 12 +++-- .../training_validation_split.py | 6 ++- evalml/preprocessing/utils.py | 50 ++++++++++++------- 8 files changed, 82 insertions(+), 39 deletions(-) diff --git a/docs/make.bat b/docs/make.bat index 4d9eb83d9f..fa6005a60c 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -25,11 +25,10 @@ if errorlevel 9009 ( exit /b 1 ) -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% goto end :help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% - +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% :end popd diff --git a/evalml/data_checks/datetime_nan_data_check.py b/evalml/data_checks/datetime_nan_data_check.py index 6ae04f207d..2f2377eb2f 100644 --- a/evalml/data_checks/datetime_nan_data_check.py +++ b/evalml/data_checks/datetime_nan_data_check.py @@ -11,10 +11,11 @@ def validate(self, X, y=None): """Checks if any datetime columns contain NaN values. Arguments: + --------- X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. Defaults to None. - Returns: + Returns ------- dict: dict with a DataCheckError if NaN values are present in datetime columns. diff --git a/evalml/data_checks/multicollinearity_data_check.py b/evalml/data_checks/multicollinearity_data_check.py index ce86d59155..282d3f1663 100644 --- a/evalml/data_checks/multicollinearity_data_check.py +++ b/evalml/data_checks/multicollinearity_data_check.py @@ -1,3 +1,4 @@ +"""Data check to check if any set features are likely to be multicollinear.""" from evalml.data_checks import ( DataCheck, DataCheckMessageCode, @@ -10,6 +11,7 @@ class MulticollinearityDataCheck(DataCheck): """Check if any set features are likely to be multicollinear. 
Arguments: + --------- threshold (float): The threshold to be considered. Defaults to 0.9. """ diff --git a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py index 0250b05e3c..bd89df7592 100644 --- a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py +++ b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py @@ -1,3 +1,4 @@ +"""Class for balanced classification downsampler.""" import numpy as np from evalml.preprocessing.data_splitters.sampler_base import SamplerBase @@ -5,9 +6,11 @@ class BalancedClassificationSampler(SamplerBase): - """Class for balanced classification downsampler. + """ + Class for balanced classification downsampler. Arguments: + --------- sampling_ratio (float): The smallest minority:majority ratio that is accepted as 'balanced'. For instance, a 1:4 ratio would be represented as 0.25, while a 1:1 ratio is 1.0. Must be between 0 and 1, inclusive. Defaults to 0.25. sampling_ratio_dict (dict): A dictionary specifying the desired balanced ratio for each target value. Overrides sampling_ratio if provided. @@ -19,6 +22,7 @@ class BalancedClassificationSampler(SamplerBase): To determine severe imbalance, the minority class must have a class ratio below this and must occur less often than min_samples. Must be between 0 and 0.5, inclusive. Defaults to 0.1. random_seed (int): The seed to use for random sampling. Defaults to 0. + """ def __init__( @@ -49,12 +53,15 @@ def __init__( self.sampling_ratio_dict = sampling_ratio_dict or {} def _find_ideal_samples(self, y): - """Returns dictionary of examples to drop for each class if we need to resample. + """ + Return dictionary of examples to drop for each class if we need to resample. Arguments: - y (pd.Series): Target data passed in + --------- + y (pd.Series): Target data passed in. Returns: + ------- (dict): dictionary with undersample target class as key, and number of samples to remove as the value. If we don't need to resample, returns empty dictionary. """ @@ -83,12 +90,15 @@ def _find_ideal_samples(self, y): return {k: v for k, v in drop_values.items() if v > 0} def _sampling_dict_to_remove_dict(self, y): - """Turns the sampling dict input into a dict of samples to remove for each target, similar to the return of _find_ideal_samples. + """ + Turn the sampling dict input into a dict of samples to remove for each target, similar to the return of _find_ideal_samples. Arguments: - y (pd.Series): Training data targets + --------- + y (pd.Series): Training data targets. Returns: + ------- (dict): dictionary with undersample target class as key, and number of samples to remove as the value. If we don't need to resample, returns empty dictionary. """ @@ -99,14 +109,17 @@ def _sampling_dict_to_remove_dict(self, y): return new_dic def fit_resample(self, X, y): - """Resampling technique for this sampler. + """ + Resampling technique for this sampler. Arguments: - X (pd.DataFrame): Training data to fit and resample - y (pd.Series): Training data targets to fit and resample + --------- + X (pd.DataFrame): Training data to fit and resample. + y (pd.Series): Training data targets to fit and resample. Returns: - list: Indices to keep for training data + ------- + list: Indices to keep for training data. 
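A minimal usage sketch, assuming the BalancedClassificationSampler API described above (fit_resample returns row indices to keep rather than resampled data):

    import pandas as pd
    from evalml.preprocessing.data_splitters import BalancedClassificationSampler

    X = pd.DataFrame({"a": range(10)})
    y = pd.Series([0] * 8 + [1] * 2)
    sampler = BalancedClassificationSampler(sampling_ratio=0.5, min_samples=1)
    indices = sampler.fit_resample(X, y)
    # keep the selected rows; the majority class is downsampled toward a 1:2 ratio
    X_balanced, y_balanced = X.loc[indices], y.loc[indices]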
""" y = infer_feature_types(y) diff --git a/evalml/preprocessing/data_splitters/sampler_base.py b/evalml/preprocessing/data_splitters/sampler_base.py index ee888c2673..780b3dc1ac 100644 --- a/evalml/preprocessing/data_splitters/sampler_base.py +++ b/evalml/preprocessing/data_splitters/sampler_base.py @@ -1,11 +1,15 @@ +"""Base class for all custom samplers.""" from abc import ABC, abstractmethod class SamplerBase(ABC): - """Base class for all custom samplers. + """ + Base class for all custom samplers. Arguments: + --------- random_seed (int): The seed to use for random sampling. Defaults to 0. + """ def __init__(self, random_seed=0): @@ -13,12 +17,16 @@ def __init__(self, random_seed=0): @abstractmethod def fit_resample(self, X, y): - """Resample the input data with this sampling strategy. + """ + Resample the input data with this sampling strategy. Arguments: + --------- X (pd.DataFrame): Training data to fit and resample. y (pd.Series): Training data targets to fit and resample. Returns: + ------- Tuple(pd.DataFrame, pd.Series) or list: resampled X and y data for oversampling or indices to keep for undersampling. + """ diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py index 9fbcb25105..a57db23dea 100644 --- a/evalml/preprocessing/data_splitters/time_series_split.py +++ b/evalml/preprocessing/data_splitters/time_series_split.py @@ -5,7 +5,8 @@ class TimeSeriesSplit(BaseCrossValidator): - """Rolling Origin Cross Validation for time series problems. + """ + Rolling Origin Cross Validation for time series problems. This class uses max_delay and gap values to take into account that evalml time series pipelines perform some feature and target engineering, e.g delaying input features and shifting the target variable by the @@ -39,7 +40,8 @@ def _check_if_empty(data): return data is None or data.empty def split(self, X, y=None, groups=None): - """Get the time series splits. + """ + Get the time series splits. X and y are assumed to be sorted in ascending time order. This method can handle passing in empty or None X and y data but note that X and y cannot be None or empty @@ -48,11 +50,11 @@ def split(self, X, y=None, groups=None): Arguments: --------- X (pd.DataFrame, None): Features to split. - y (pd.DataFrame, None): Target variable to split. - groups: Ignored but kept for compatibility with sklearn API. + y (pd.DataFrame, None): Target variable to split. Defaults to None. + groups: Ignored but kept for compatibility with sklearn API. Defaults to None. Returns - ------ + ------- Iterator of (train, test) indices tuples. """ # Sklearn splitters always assume a valid X is passed but we need to support the diff --git a/evalml/preprocessing/data_splitters/training_validation_split.py b/evalml/preprocessing/data_splitters/training_validation_split.py index 92225de76e..dfded94aa4 100644 --- a/evalml/preprocessing/data_splitters/training_validation_split.py +++ b/evalml/preprocessing/data_splitters/training_validation_split.py @@ -33,17 +33,19 @@ def __init__( @staticmethod def get_n_splits(): - """Returns the number of splits of this object.""" + """Return the number of splits of this object.""" return 1 def split(self, X, y=None): - """Divides the data into training and testing sets. + """Divide the data into training and testing sets. 
Arguments:
+        ---------
             X (pd.DataFrame): Dataframe of points to split
             y (pd.Series): Series of points to split

         Returns:
+        -------
             list: Indices to split data into training and test set
         """
         train, test = train_test_split(
diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
index b5121467f1..e4546c904f 100644
--- a/evalml/preprocessing/utils.py
+++ b/evalml/preprocessing/utils.py
@@ -11,18 +11,22 @@


 def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwargs):
-    """Load features and target from file.
+    """
+    Load features and target from file.

     Arguments:
-        path (str): Path to file or a http/ftp/s3 URL
-        index (str): Column for index
-        target (str): Column for target
-        n_rows (int): Number of rows to return
-        drop (list): List of columns to drop
-        verbose (bool): If True, prints information about features and target
+    ---------
+        path (str): Path to file or an http/ftp/s3 URL.
+        index (str): Column for index.
+        target (str): Column for target.
+        n_rows (int): Number of rows to return. Defaults to None.
+        drop (list): List of columns to drop. Defaults to None.
+        verbose (bool): If True, prints information about features and target. Defaults to True.

     Returns:
-        pd.DataFrame, pd.Series: Features matrix and target
+    -------
+        pd.DataFrame, pd.Series: Features matrix and target.
+
     """
     feature_matrix = pd.read_csv(path, index_col=index, nrows=n_rows, **kwargs)

@@ -51,6 +55,7 @@ def split_data(
     """Splits data into train and test sets.

     Arguments:
+    ---------
         X (pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
         y (pd.Series, or np.ndarray): target data of length [n_samples]
         problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
@@ -60,7 +65,8 @@ def split_data(
         random_seed (int): Seed for the random number generator. Defaults to 0.

     Returns:
-        pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets
+        pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets.
+
     """
     X = infer_feature_types(X)
     y = infer_feature_types(y)
@@ -90,13 +96,17 @@ def split_data(


 def number_of_features(dtypes):
-    """Get the number of features of each specific dtype in a DataFrame.
+    """
+    Get the number of features of each specific dtype in a DataFrame.

     Arguments:
-        dtypes (pd.Series): DataFrame.dtypes to get the number of features for
+    ---------
+        dtypes (pd.Series): DataFrame.dtypes to get the number of features for.

     Returns:
-        pd.Series: dtypes and the number of features for each input type
+    -------
+        pd.Series: dtypes and the number of features for each input type.
+
     """
     dtype_to_vtype = {
         "bool": "Boolean",
@@ -112,12 +122,15 @@ def number_of_features(dtypes):


 def target_distribution(targets):
-    """Get the target distributions.
+    """
+    Get the target distributions.

     Arguments:
-        targets (pd.Series): Target data
+    ---------
+        targets (pd.Series): Target data.

     Returns:
+    -------
         pd.Series: Target data and their frequency distribution as percentages.
     """
     distribution = targets.value_counts() / len(targets)
@@ -125,13 +138,16 @@ def target_distribution(targets):


 def drop_nan_target_rows(X, y):
-    """Drops rows in X and y when row in the target y has a value of NaN.
+    """
+    Drops rows in X and y when a row in the target y has a value of NaN.
Arguments: - X (pd.DataFrame, np.ndarray): Data to transform - y (pd.Series, np.ndarray): Target data + --------- + X (pd.DataFrame, np.ndarray): Data to transform. + y (pd.Series, np.ndarray): Target data. Returns: + ------- pd.DataFrame, pd.DataFrame: Transformed X (and y, if passed in) with rows that had a NaN value removed. """ X_t = X From 8c646c8ebdab629df646803b8e0cf6662986022d Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 27 Aug 2021 15:26:30 -0400 Subject: [PATCH 09/62] try to fix class imbal dc as example --- evalml/data_checks/class_imbalance_data_check.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/evalml/data_checks/class_imbalance_data_check.py b/evalml/data_checks/class_imbalance_data_check.py index bcc2bed150..76ff6765b7 100644 --- a/evalml/data_checks/class_imbalance_data_check.py +++ b/evalml/data_checks/class_imbalance_data_check.py @@ -16,7 +16,7 @@ class ClassImbalanceDataCheck(DataCheck): """ Check if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds. Use for classification problems. - Arguments: + Arguments --------- threshold (float): The minimum threshold allowed for class imbalance before a warning is raised. This threshold is calculated by comparing the number of samples in each class to the sum of samples in that class and the majority class. @@ -53,7 +53,7 @@ def validate(self, X, y): """ Check if any target labels are imbalanced beyond a threshold for binary and multiclass problems Ignores NaN values in target labels if they appear. - Arguments: + Arguments --------- X (pd.DataFrame, np.ndarray): Features. Ignored. y (pd.Series, np.ndarray): Target labels to check for imbalanced data. @@ -63,7 +63,7 @@ def validate(self, X, y): dict: Dictionary with DataCheckWarnings if imbalance in classes is less than the threshold, and DataCheckErrors if the number of values for each target is below 2 * num_cv_folds. 
- Example: + Example ------- >>> import pandas as pd >>> X = pd.DataFrame() From c6a0e54d1ed9efd897444ebb23e3d5885d159736 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sun, 29 Aug 2021 19:52:53 -0400 Subject: [PATCH 10/62] empty commit From 2bdbb99d26cdb9fefc0870ab128fb3a12db52dcc Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sun, 29 Aug 2021 21:51:44 -0400 Subject: [PATCH 11/62] change to parameters --- .../automl_algorithm/automl_algorithm.py | 4 +- .../automl_algorithm/evalml_algorithm.py | 4 +- .../automl_algorithm/iterative_algorithm.py | 6 +-- evalml/automl/automl_search.py | 20 ++++----- evalml/automl/engine/cf_engine.py | 10 ++--- evalml/automl/engine/dask_engine.py | 10 ++--- evalml/automl/engine/engine_base.py | 8 ++-- evalml/automl/engine/sequential_engine.py | 2 +- evalml/automl/pipeline_search_plots.py | 2 +- evalml/automl/utils.py | 12 ++--- evalml/data_checks/__init__.py | 1 + evalml/data_checks/data_check.py | 13 ++++-- evalml/data_checks/data_check_action.py | 2 +- evalml/data_checks/data_check_message.py | 2 +- evalml/data_checks/data_check_message_code.py | 1 + evalml/data_checks/data_check_message_type.py | 1 + evalml/data_checks/data_checks.py | 4 +- .../data_checks/datetime_format_data_check.py | 18 +++++--- evalml/data_checks/datetime_nan_data_check.py | 11 +++-- evalml/data_checks/default_data_checks.py | 2 +- evalml/data_checks/highly_null_data_check.py | 4 +- evalml/data_checks/id_columns_data_check.py | 4 +- .../data_checks/invalid_targets_data_check.py | 16 ++++--- .../multicollinearity_data_check.py | 4 +- .../natural_language_nan_data_check.py | 2 +- evalml/data_checks/no_variance_data_check.py | 6 +-- evalml/data_checks/outliers_data_check.py | 28 +++++++----- evalml/data_checks/sparsity_data_check.py | 6 +-- .../target_distribution_data_check.py | 2 +- .../data_checks/target_leakage_data_check.py | 4 +- evalml/data_checks/uniqueness_data_check.py | 22 ++++++--- evalml/data_checks/utils.py | 2 +- evalml/demos/churn.py | 2 +- evalml/demos/fraud.py | 2 +- evalml/exceptions/exceptions.py | 2 +- evalml/model_family/utils.py | 2 +- evalml/model_understanding/graphs.py | 44 +++++++++--------- .../permutation_importance.py | 4 +- .../prediction_explanations/_algorithms.py | 12 ++--- .../_report_creator_factory.py | 2 +- .../_user_interface.py | 22 ++++----- .../prediction_explanations/explainers.py | 8 ++-- .../binary_classification_objective.py | 4 +- evalml/objectives/cost_benefit_matrix.py | 4 +- evalml/objectives/fraud_cost.py | 4 +- evalml/objectives/lead_scoring.py | 4 +- evalml/objectives/objective_base.py | 10 ++--- evalml/objectives/sensitivity_low_alert.py | 6 +-- evalml/objectives/utils.py | 4 +- .../binary_classification_pipeline.py | 6 +-- .../binary_classification_pipeline_mixin.py | 2 +- evalml/pipelines/classification_pipeline.py | 12 ++--- evalml/pipelines/component_graph.py | 30 ++++++------- evalml/pipelines/components/component_base.py | 12 ++--- .../ensemble/sklearn_stacked_ensemble_base.py | 2 +- .../sklearn_stacked_ensemble_classifier.py | 2 +- .../sklearn_stacked_ensemble_regressor.py | 2 +- .../classifiers/baseline_classifier.py | 2 +- .../classifiers/catboost_classifier.py | 2 +- .../classifiers/decision_tree_classifier.py | 2 +- .../classifiers/elasticnet_classifier.py | 2 +- .../estimators/classifiers/et_classifier.py | 2 +- .../classifiers/kneighbors_classifier.py | 2 +- .../classifiers/lightgbm_classifier.py | 2 +- .../logistic_regression_classifier.py | 2 +- .../estimators/classifiers/rf_classifier.py | 2 +- 
.../estimators/classifiers/svm_classifier.py | 2 +- .../classifiers/xgboost_classifier.py | 2 +- .../components/estimators/estimator.py | 6 +-- .../estimators/regressors/arima_regressor.py | 2 +- .../regressors/baseline_regressor.py | 2 +- .../regressors/catboost_regressor.py | 2 +- .../regressors/decision_tree_regressor.py | 2 +- .../regressors/elasticnet_regressor.py | 2 +- .../estimators/regressors/et_regressor.py | 2 +- .../regressors/lightgbm_regressor.py | 2 +- .../estimators/regressors/linear_regressor.py | 2 +- .../estimators/regressors/rf_regressor.py | 2 +- .../estimators/regressors/svm_regressor.py | 2 +- .../time_series_baseline_estimator.py | 2 +- .../regressors/xgboost_regressor.py | 2 +- .../transformers/column_selectors.py | 14 +++--- .../dimensionality_reduction/lda.py | 2 +- .../dimensionality_reduction/pca.py | 2 +- .../transformers/encoders/onehot_encoder.py | 6 +-- .../transformers/encoders/target_encoder.py | 2 +- .../feature_selection/feature_selector.py | 4 +- .../rf_classifier_feature_selector.py | 2 +- .../rf_regressor_feature_selector.py | 2 +- .../transformers/imputers/imputer.py | 6 +-- .../imputers/per_column_imputer.py | 6 +-- .../transformers/imputers/simple_imputer.py | 8 ++-- .../transformers/imputers/target_imputer.py | 8 ++-- .../preprocessing/datetime_featurizer.py | 4 +- .../delayed_feature_transformer.py | 6 +-- .../preprocessing/drop_null_columns.py | 4 +- .../preprocessing/drop_rows_transformer.py | 2 +- .../preprocessing/featuretools.py | 6 +-- .../preprocessing/log_transformer.py | 6 +-- .../transformers/preprocessing/lsa.py | 4 +- .../preprocessing/polynomial_detrender.py | 10 ++--- .../preprocessing/text_featurizer.py | 6 +-- .../preprocessing/text_transformer.py | 2 +- .../transform_primitive_components.py | 4 +- .../transformers/samplers/base_sampler.py | 18 ++++---- .../transformers/samplers/oversamplers.py | 6 +-- .../transformers/samplers/undersampler.py | 4 +- .../transformers/scalers/standard_scaler.py | 2 +- .../components/transformers/transformer.py | 8 ++-- evalml/pipelines/components/utils.py | 24 +++++----- .../multiclass_classification_pipeline.py | 2 +- evalml/pipelines/pipeline_base.py | 34 +++++++------- evalml/pipelines/regression_pipeline.py | 6 +-- .../time_series_classification_pipelines.py | 14 +++--- evalml/pipelines/time_series_pipeline_base.py | 4 +- .../time_series_regression_pipeline.py | 6 +-- evalml/pipelines/utils.py | 45 ++++++++++--------- evalml/preprocessing/__init__.py | 1 + .../preprocessing/data_splitters/__init__.py | 1 + .../balanced_classification_sampler.py | 14 +++--- .../data_splitters/sampler_base.py | 6 +-- .../data_splitters/time_series_split.py | 4 +- .../training_validation_split.py | 8 ++-- evalml/preprocessing/utils.py | 28 ++++++------ evalml/problem_types/utils.py | 14 +++--- evalml/tests/conftest.py | 6 +-- evalml/tuners/grid_search_tuner.py | 4 +- evalml/tuners/random_search_tuner.py | 4 +- evalml/tuners/skopt_tuner.py | 4 +- evalml/tuners/tuner.py | 4 +- evalml/utils/gen_utils.py | 30 ++++++------- evalml/utils/logger.py | 2 +- evalml/utils/woodwork_utils.py | 6 +-- requirements.txt | 2 +- 134 files changed, 480 insertions(+), 436 deletions(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index 77919f7ac6..87eb3bd112 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -17,7 +17,7 @@ class AutoMLAlgorithm(ABC): To use this interface, you must define a 
next_batch method which returns the next group of pipelines to evaluate on the training data. That method may access state and results recorded from the previous batches, although that information is not tracked in a general way in this base class. Overriding add_result is a convenient way to record pipeline evaluation info if necessary. - Arguments: + Parameters allowed_pipelines (list(class)): A list of PipelineBase subclasses indicating the pipelines allowed in the search. The default of None indicates all pipelines for this problem type are allowed. custom_hyperparameters (dict): Custom hyperparameter ranges specified for pipelines to iterate over. max_iterations (int): The maximum number of iterations to be evaluated. @@ -59,7 +59,7 @@ def next_batch(self): def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """Register results from evaluating a pipeline. - Arguments: + Parameters score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines. pipeline (PipelineBase): The trained pipeline object which was used to compute the score. trained_pipeline_results (dict): Results from training a pipeline. diff --git a/evalml/automl/automl_algorithm/evalml_algorithm.py b/evalml/automl/automl_algorithm/evalml_algorithm.py index da4f552518..4fe6a2b86f 100644 --- a/evalml/automl/automl_algorithm/evalml_algorithm.py +++ b/evalml/automl/automl_algorithm/evalml_algorithm.py @@ -67,7 +67,7 @@ def __init__( num_long_pipelines_per_batch=10, ): """ - Arguments: + Parameters X (pd.DataFrame): Training data y (pd.Series): Target data problem_type (ProblemType): Problem type associated with training data @@ -295,7 +295,7 @@ def next_batch(self): def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """Register results from evaluating a pipeline. In batch number 2, the selected column names from the feature selector are taken to be used in a column selector. Information regarding the best pipeline is updated here as well. - Arguments: + Parameters score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines. pipeline (PipelineBase): The trained pipeline object which was used to compute the score. trained_pipeline_results (dict): Results from training a pipeline. diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index b80a9f443e..0043ea7335 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -24,7 +24,7 @@ class IterativeAlgorithm(AutoMLAlgorithm): """An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance. - Arguments: + Parameters allowed_pipelines (list(class)): A list of PipelineBase instances indicating the pipelines allowed in the search. The default of None indicates all pipelines for this problem type are allowed. max_iterations (int): The maximum number of iterations to be evaluated. tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used. 
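The next_batch/add_result contract above is easiest to see in a driver loop. A minimal sketch, assuming a binary problem where `pipelines`, `X`, and `y` already exist, and where the results dict handed to add_result is a placeholder rather than what AutoMLSearch actually records:

    from evalml.automl.automl_algorithm import IterativeAlgorithm

    # Sketch: driving an AutoMLAlgorithm subclass by hand.
    algorithm = IterativeAlgorithm(allowed_pipelines=pipelines)
    for _ in range(3):  # evaluate three batches
        for pipeline in algorithm.next_batch():
            pipeline.fit(X, y)
            scores = pipeline.score(X, y, objectives=["Log Loss Binary"])
            # The algorithm expects a score where lower is better.
            algorithm.add_result(
                scores["Log Loss Binary"], pipeline, {"id": algorithm.pipeline_number}
            )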
@@ -56,7 +56,7 @@ def __init__( ): """An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance. - Arguments: + Parameters allowed_pipelines (list(class)): A list of PipelineBase instances indicating the pipelines allowed in the search. The default of None indicates all pipelines for this problem type are allowed. max_iterations (int): The maximum number of iterations to be evaluated. tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used. @@ -198,7 +198,7 @@ def next_batch(self): def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """Register results from evaluating a pipeline. - Arguments: + Parameters score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines. pipeline (PipelineBase): The trained pipeline object which was used to compute the score. trained_pipeline_results (dict): Results from training a pipeline. diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 30275f5958..3554546b6d 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -81,7 +81,7 @@ def search( This method is provided for convenience. If you'd like more control over when each of these steps is run, consider making calls directly to the various pieces like the data checks and AutoMLSearch, instead of using this method. - Arguments: + Parameters X_train (pd.DataFrame): The input training data of shape [n_samples, n_features]. Required. y_train (pd.Series): The target training data of length [n_samples]. Required for supervised learning tasks. @@ -152,7 +152,7 @@ def search( class AutoMLSearch: """Automated Pipeline search. - Arguments: + Parameters X_train (pd.DataFrame): The input training data of shape [n_samples, n_features]. Required. y_train (pd.Series): The target training data of length [n_samples]. Required for supervised learning tasks. @@ -784,7 +784,7 @@ def _handle_keyboard_interrupt(self): def search(self, show_iteration_plot=True): """Find the best pipeline for the data set. - Arguments: + Parameters feature_types (list, optional): list of feature types, either numerical or categorical. Categorical features will automatically be encoded @@ -1199,7 +1199,7 @@ def _check_for_high_variance(self, pipeline, cv_scores, threshold=0.5): def get_pipeline(self, pipeline_id): """Given the ID of a pipeline training result, returns an untrained instance of the specified pipeline initialized with the parameters used to train that pipeline during automl search. - Arguments: + Parameters pipeline_id (int): pipeline to retrieve Returns: @@ -1219,7 +1219,7 @@ def get_pipeline(self, pipeline_id): def describe_pipeline(self, pipeline_id, return_dict=False): """Describe a pipeline. - Arguments: + Parameters pipeline_id (int): pipeline to describe return_dict (bool): If True, return dictionary of information about pipeline. Defaults to False. @@ -1295,7 +1295,7 @@ def describe_pipeline(self, pipeline_id, return_dict=False): def add_to_rankings(self, pipeline): """Fits and evaluates a given pipeline then adds the results to the automl rankings with the requirement that automl search has been run. - Arguments: + Parameters pipeline (PipelineBase): pipeline to train and evaluate. 
""" pipeline_rows = self.full_rankings[ @@ -1388,7 +1388,7 @@ def save( ): """Saves AutoML object at file path. - Arguments: + Parameters file_path (str): location to save file pickle_type {"pickle", "cloudpickle"}: the pickling library to use. pickle_protocol (int): the pickle data stream format. @@ -1415,7 +1415,7 @@ def load( ): """Loads AutoML object at file path. - Arguments: + Parameters file_path (str): location to find file to load pickle_type {"pickle", "cloudpickle"}: the pickling library to use. Currently not used since the standard pickle library can handle cloudpickles. @@ -1430,7 +1430,7 @@ def train_pipelines(self, pipelines): This can be helpful for training pipelines once the search is complete. - Arguments: + Parameters pipelines (list(PipelineBase)): List of pipelines to train. Returns: @@ -1470,7 +1470,7 @@ def train_pipelines(self, pipelines): def score_pipelines(self, pipelines, X_holdout, y_holdout, objectives): """Score a list of pipelines on the given holdout data. - Arguments: + Parameters pipelines (list(PipelineBase)): List of pipelines to train. X_holdout (pd.DataFrame): Holdout features. y_holdout (pd.Series): Holdout targets for scoring. diff --git a/evalml/automl/engine/cf_engine.py b/evalml/automl/engine/cf_engine.py index bb84fe9204..d74fcd0fe9 100644 --- a/evalml/automl/engine/cf_engine.py +++ b/evalml/automl/engine/cf_engine.py @@ -12,7 +12,7 @@ class CFClient: def __init__(self, pool): """ - Arguments: + Parameters pool(cf.ThreadPoolExecutor or cf.ProcessPoolExecutor): the resource pool to execute the futures work on. """ @@ -34,7 +34,7 @@ class CFComputation(EngineComputation): def __init__(self, future): """ - Arguments: + Parameters future(cf.Future): The concurrent.futures.Future that is desired to be executed. """ @@ -92,7 +92,7 @@ def __init__(self, client): def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputation: """Send evaluation job to cluster. - Arguments: + Parameters automl_config: structure containing data passed from AutoMLSearch instance pipeline (pipeline.PipelineBase): pipeline to evaluate X (pd.DataFrame): input data for modeling @@ -115,7 +115,7 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputat def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputation: """Send training job to cluster. - Arguments: + Parameters automl_config: structure containing data passed from AutoMLSearch instance pipeline (pipeline.PipelineBase): pipeline to train X (pd.DataFrame): input data for modeling @@ -134,7 +134,7 @@ def submit_scoring_job( ) -> EngineComputation: """Send scoring job to cluster. - Arguments: + Parameters automl_config: structure containing data passed from AutoMLSearch instance pipeline (pipeline.PipelineBase): pipeline to train X (pd.DataFrame): input data for modeling diff --git a/evalml/automl/engine/dask_engine.py b/evalml/automl/engine/dask_engine.py index 6a61b6fd16..64198d2a39 100644 --- a/evalml/automl/engine/dask_engine.py +++ b/evalml/automl/engine/dask_engine.py @@ -13,7 +13,7 @@ class DaskComputation(EngineComputation): """A Future-like wrapper around jobs created by the DaskEngine. - Arguments: + Parameters dask_future (callable): Computation to do. """ @@ -66,7 +66,7 @@ def send_data_to_cluster(self, X, y): The implementation uses caching so the data is only sent once. This follows dask best practices. 
- Arguments: + Parameters X (pd.DataFrame): input data for modeling y (pd.Series): target data for modeling Returns @@ -85,7 +85,7 @@ def send_data_to_cluster(self, X, y): def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputation: """Send evaluation job to cluster. - Arguments: + Parameters automl_config: structure containing data passed from AutoMLSearch instance pipeline (pipeline.PipelineBase): pipeline to evaluate X (pd.DataFrame): input data for modeling @@ -109,7 +109,7 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputat def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputation: """Send training job to cluster. - Arguments: + Parameters automl_config: structure containing data passed from AutoMLSearch instance pipeline (pipeline.PipelineBase): pipeline to train X (pd.DataFrame): input data for modeling @@ -129,7 +129,7 @@ def submit_scoring_job( ) -> EngineComputation: """Send scoring job to cluster. - Arguments: + Parameters automl_config: structure containing data passed from AutoMLSearch instance pipeline (pipeline.PipelineBase): pipeline to train X (pd.DataFrame): input data for modeling diff --git a/evalml/automl/engine/engine_base.py b/evalml/automl/engine/engine_base.py index daabc91353..7ef97054af 100644 --- a/evalml/automl/engine/engine_base.py +++ b/evalml/automl/engine/engine_base.py @@ -98,7 +98,7 @@ def submit_scoring_job(self, automl_config, pipeline, X, y, objectives): def train_pipeline(pipeline, X, y, automl_config, schema=True): """Train a pipeline and tune the threshold if necessary. - Arguments: + Parameters pipeline (PipelineBase): Pipeline to train. X (pd.DataFrame): Features to train on. y (pd.Series): Target to train on. @@ -147,7 +147,7 @@ def train_and_score_pipeline( ): """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores. - Arguments: + Parameters pipeline (PipelineBase): The pipeline to score automl_config (AutoMLSearch): The AutoMLSearch object, used to access config and the error callback full_X_train (pd.DataFrame): Training features @@ -290,7 +290,7 @@ def train_and_score_pipeline( def evaluate_pipeline(pipeline, automl_config, X, y, logger): """Function submitted to the submit_evaluation_job engine method. - Arguments: + Parameters pipeline (PipelineBase): The pipeline to score automl_config (AutoMLConfig): The AutoMLSearch object, used to access config and the error callback X (pd.DataFrame): Training features @@ -317,7 +317,7 @@ def evaluate_pipeline(pipeline, automl_config, X, y, logger): def score_pipeline(pipeline, X, y, objectives, X_schema=None, y_schema=None): """Wrapper around pipeline.score method to make it easy to score pipelines with dask. - Arguments: + Parameters pipeline (PipelineBase): The pipeline to score. X (pd.DataFrame): Features to score on. y (pd.Series): Target used to calcualte scores. diff --git a/evalml/automl/engine/sequential_engine.py b/evalml/automl/engine/sequential_engine.py index acabde2204..30461a8f01 100644 --- a/evalml/automl/engine/sequential_engine.py +++ b/evalml/automl/engine/sequential_engine.py @@ -17,7 +17,7 @@ class SequentialComputation(EngineComputation): computation is "done", by always returning True in done() we make sure that get_result is called in the order that the jobs are submitted. So the computations happen sequentially! - Arguments: + Parameters work (callable): Computation that should be done by the engine. 
""" diff --git a/evalml/automl/pipeline_search_plots.py b/evalml/automl/pipeline_search_plots.py index 6d1589c867..cd2805eb68 100644 --- a/evalml/automl/pipeline_search_plots.py +++ b/evalml/automl/pipeline_search_plots.py @@ -79,7 +79,7 @@ class PipelineSearchPlots: def __init__(self, results, objective): """Make plots for the AutoMLSearch class. - Arguments: + Parameters data (AutoMLSearch): Automated pipeline search object """ self._go = import_or_raise( diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 2c5b74adae..93b27eb8a4 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -32,7 +32,7 @@ def get_default_primary_search_objective(problem_type): """Get the default primary search objective for a problem type. - Arguments: + Parameters problem_type (str or ProblemType): problem type of interest. Returns @@ -62,7 +62,7 @@ def make_data_splitter( ): """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search. - Arguments: + Parameters X (pd.DataFrame): The input training data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. problem_type (ProblemType): The type of machine learning problem. @@ -105,7 +105,7 @@ def tune_binary_threshold( ): """Tunes the threshold of a binary pipeline to the X and y thresholding data. - Arguments: + Parameters pipeline (Pipeline): Pipeline instance to threshold. objective (ObjectiveBase): The objective we want to tune with. If not tuneable and best_pipeline is True, will use F1. problem_type (ProblemType): The problem type of the pipeline. @@ -129,7 +129,7 @@ def tune_binary_threshold( def check_all_pipeline_names_unique(pipelines): """Checks whether all the pipeline names are unique. - Arguments: + Parameters pipelines (list(PipelineBase)): List of pipelines to check if all names are unique. Returns: @@ -169,7 +169,7 @@ def check_all_pipeline_names_unique(pipelines): def get_best_sampler_for_data(X, y, sampler_method, sampler_balanced_ratio): """Returns the name of the sampler component to use for AutoMLSearch. - Arguments: + Parameters X (pd.DataFrame): The input feature data y (pd.Series): The input target data sampler_method (str): The sampler_type argument passed to AutoMLSearch @@ -211,7 +211,7 @@ def get_pipelines_from_component_graphs( ): """Returns created pipelines from passed component graphs based on the specified problem type. - Arguments: + Parameters component_graphs_dict (dict): The dict of component graphs. problem_type (str or ProblemType): The problem type for which pipelines will be created. parameters (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. diff --git a/evalml/data_checks/__init__.py b/evalml/data_checks/__init__.py index 37269bcfac..5f5fcc8061 100644 --- a/evalml/data_checks/__init__.py +++ b/evalml/data_checks/__init__.py @@ -1,3 +1,4 @@ +"""Data checks.""" from .data_check import DataCheck from .data_check_message_code import DataCheckMessageCode from .data_check_action import DataCheckAction diff --git a/evalml/data_checks/data_check.py b/evalml/data_checks/data_check.py index fc26243337..86dfbedaa4 100644 --- a/evalml/data_checks/data_check.py +++ b/evalml/data_checks/data_check.py @@ -1,3 +1,4 @@ +"""Base class for all data checks.""" from abc import ABC, abstractmethod from evalml.data_checks.data_check_message_type import DataCheckMessageType @@ -5,7 +6,8 @@ class DataCheck(ABC): - """Base class for all data checks. 
+    """
+    Base class for all data checks.
 
     Data checks are a set of heuristics used to determine if there are problems with input data.
 
@@ -18,13 +20,16 @@ def name(cls):
 
     @abstractmethod
     def validate(self, X, y=None):
-        """Inspects and validates the input data, runs any necessary calculations or algorithms, and returns a list of warnings and errors if applicable.
+        """
+        Inspect and validate the input data, run any necessary calculations or algorithms, and return a list of warnings and errors if applicable.
 
-        Arguments:
+        Arguments
+        ---------
            X (pd.DataFrame): The input data of shape [n_samples, n_features]
            y (pd.Series, optional): The target data of length [n_samples]
 
-        Returns:
+        Returns
+        -------
            dict (DataCheckMessage): Dictionary of DataCheckError and DataCheckWarning messages
 
        """
diff --git a/evalml/data_checks/data_check_action.py b/evalml/data_checks/data_check_action.py
index 2f8443bed4..03c80f58c0 100644
--- a/evalml/data_checks/data_check_action.py
+++ b/evalml/data_checks/data_check_action.py
@@ -5,7 +5,7 @@ class DataCheckAction:
     """
     Recommended action returned by a DataCheck.
 
-    Arguments:
+    Arguments
    ---------
        action_code (DataCheckActionCode): Action code associated with the action.
        metadata (dict, optional): Additional useful information associated with the action. Defaults to None.
diff --git a/evalml/data_checks/data_check_message.py b/evalml/data_checks/data_check_message.py
index 9235f0cc7f..1bab135d62 100644
--- a/evalml/data_checks/data_check_message.py
+++ b/evalml/data_checks/data_check_message.py
@@ -6,7 +6,7 @@ class DataCheckMessage:
     """
     Base class for a message returned by a DataCheck, tagged by name.
 
-    Arguments:
+    Arguments
    ---------
        message (str): Message string.
        data_check_name (str): Name of data check.
diff --git a/evalml/data_checks/data_check_message_code.py b/evalml/data_checks/data_check_message_code.py
index ee68fb1bb5..5cbaad2330 100644
--- a/evalml/data_checks/data_check_message_code.py
+++ b/evalml/data_checks/data_check_message_code.py
@@ -1,3 +1,4 @@
+"""Enum for data check message code."""
 from enum import Enum
 
diff --git a/evalml/data_checks/data_check_message_type.py b/evalml/data_checks/data_check_message_type.py
index d88395273f..717f6b931e 100644
--- a/evalml/data_checks/data_check_message_type.py
+++ b/evalml/data_checks/data_check_message_type.py
@@ -1,3 +1,4 @@
+"""Enum for type of data check message."""
 from enum import Enum
 
diff --git a/evalml/data_checks/data_checks.py b/evalml/data_checks/data_checks.py
index b4bd41d69c..106ad7d992 100644
--- a/evalml/data_checks/data_checks.py
+++ b/evalml/data_checks/data_checks.py
@@ -20,7 +20,7 @@ class DataChecks:
     """
     A collection of data checks.
 
-    Arguments:
+    Arguments
    ---------
        data_checks (list (DataCheck)): List of DataCheck objects.
        data_check_params (dict): Parameters for passed DataCheck objects.
@@ -91,7 +91,7 @@ def validate(self, X, y=None):
     """
     Inspect and validate the input data against data checks and returns a list of warnings and errors if applicable.
-        Arguments:
+        Arguments
         ---------
            X (pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features]
            y (pd.Series, np.ndarray): The target data of length [n_samples]
diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py
index effe387544..665706282b 100644
--- a/evalml/data_checks/datetime_format_data_check.py
+++ b/evalml/data_checks/datetime_format_data_check.py
@@ -1,3 +1,5 @@
+"""Data check that checks if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators.
+"""
 import pandas as pd
 
 from evalml.data_checks import DataCheck, DataCheckError, DataCheckMessageCode
@@ -5,9 +7,11 @@
 
 
 class DateTimeFormatDataCheck(DataCheck):
-    """Checks if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators.
+    """
+    Check if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators.
 
-    Arguments:
+    Parameters
+    ----------
        datetime_column (str, int): The name of the datetime column. If the datetime values are in the index, then pass "index".
     """
 
@@ -17,14 +21,17 @@ def __init__(self, datetime_column="index"):
     def validate(self, X, y):
         """Checks if the target data has equal intervals and is sorted.
 
-        Arguments:
+        Parameters
+        ----------
            X (pd.DataFrame, np.ndarray): Features.
            y (pd.Series, np.ndarray): Target data.
 
-        Returns:
+        Returns
+        -------
            dict (DataCheckError): List with DataCheckErrors if unequal intervals are found in the datetime column.
 
-        Example:
+        Example
+        -------
            >>> import pandas as pd
            >>> X = pd.DataFrame(pd.date_range("January 1, 2021", periods=8), columns=["dates"])
            >>> y = pd.Series([1, 2, 4, 2, 1, 2, 3, 1])
@@ -38,6 +45,7 @@
 ...         "details": {}}],
 ...     "warnings": [],
 ...     "actions": []}
+
     """
 
     results = {"warnings": [], "errors": [], "actions": []}
diff --git a/evalml/data_checks/datetime_nan_data_check.py b/evalml/data_checks/datetime_nan_data_check.py
index 2f2377eb2f..dbef5b4d9a 100644
--- a/evalml/data_checks/datetime_nan_data_check.py
+++ b/evalml/data_checks/datetime_nan_data_check.py
@@ -1,3 +1,5 @@
+"""Data check that checks each column in the input for datetime features and will issue an error if NaN values are present."""
+
 from evalml.data_checks import DataCheck, DataCheckError, DataCheckMessageCode
 from evalml.utils.woodwork_utils import infer_feature_types
 
@@ -5,12 +7,12 @@
 
 
 class DateTimeNaNDataCheck(DataCheck):
-    """Checks each column in the input for datetime features and will issue an error if NaN values are present."""
+    """Check each column in the input for datetime features and will issue an error if NaN values are present."""
 
     def validate(self, X, y=None):
-        """Checks if any datetime columns contain NaN values.
+        """Check if any datetime columns contain NaN values.
 
-        Arguments:
+        Arguments
        ---------
            X (pd.DataFrame, np.ndarray): Features.
            y (pd.Series, np.ndarray): Ignored. Defaults to None.
 
        Returns
        -------
            dict: dict with a DataCheckError if NaN values are present in datetime columns.
- Example: + Example + ------- >>> import pandas as pd >>> import woodwork as ww >>> import numpy as np diff --git a/evalml/data_checks/default_data_checks.py b/evalml/data_checks/default_data_checks.py index 228f34ac53..b71f2fc987 100644 --- a/evalml/data_checks/default_data_checks.py +++ b/evalml/data_checks/default_data_checks.py @@ -36,7 +36,7 @@ class DefaultDataChecks(DataChecks): - `TargetDistributionDataCheck` (for regression problem types) - `DateTimeFormatDataCheck` (for time series problem types) - Arguments: + Arguments --------- problem_type (str): The problem type that is being validated. Can be regression, binary, or multiclass. objective (str or ObjectiveBase): Name or instance of the objective class. diff --git a/evalml/data_checks/highly_null_data_check.py b/evalml/data_checks/highly_null_data_check.py index 480e1ea13c..554da323dd 100644 --- a/evalml/data_checks/highly_null_data_check.py +++ b/evalml/data_checks/highly_null_data_check.py @@ -14,7 +14,7 @@ class HighlyNullDataCheck(DataCheck): """ Check if there are any highly-null columns and rows in the input. - Arguments: + Parameters --------- pct_null_col_threshold(float): If the percentage of NaN values in an input feature exceeds this amount, that column will be considered highly-null. Defaults to 0.95. @@ -40,7 +40,7 @@ def validate(self, X, y=None): """ Check if there are any highly-null columns or rows in the input. - Arguments: + Parameters --------- X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. diff --git a/evalml/data_checks/id_columns_data_check.py b/evalml/data_checks/id_columns_data_check.py index 0b8e1776c5..18db3c07f8 100644 --- a/evalml/data_checks/id_columns_data_check.py +++ b/evalml/data_checks/id_columns_data_check.py @@ -13,7 +13,7 @@ class IDColumnsDataCheck(DataCheck): """ Check if any of the features are likely to be ID columns. - Arguments: + Parameters --------- id_threshold (float): The probability threshold to be considered an ID column. Defaults to 1.0. @@ -32,7 +32,7 @@ def validate(self, X, y=None): - column name ends in "_id" - column contains all unique values (and is categorical / integer type) - Arguments: + Parameters --------- X (pd.DataFrame, np.ndarray): The input features to check.T y (pd.Series): The target. Defaults to None. Ignored. diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py index d41388e603..4deb3be466 100644 --- a/evalml/data_checks/invalid_targets_data_check.py +++ b/evalml/data_checks/invalid_targets_data_check.py @@ -1,3 +1,4 @@ +"""Data check that checks if the target data contains missing or invalid values.""" import woodwork as ww from evalml.data_checks import ( @@ -23,9 +24,11 @@ class InvalidTargetDataCheck(DataCheck): - """Checks if the target data contains missing or invalid values. + """ + Check if the target data contains missing or invalid values. - Arguments: + Arguments + --------- problem_type (str or ProblemTypes): The specific problem type to data check for. e.g. 'binary', 'multiclass', 'regression, 'time series regression' objective (str or ObjectiveBase): Name or instance of the objective class. @@ -43,10 +46,10 @@ def __init__(self, problem_type, objective, n_unique=100): self.n_unique = n_unique def validate(self, X, y): - """Checks if the target data contains missing or invalid values. + """Check if the target data contains missing or invalid values. - Arguments: - --------- + Parameters + ---------- X (pd.DataFrame, np.ndarray): Features. Ignored. 
y (pd.Series, np.ndarray): Target data to check for invalid values. @@ -54,7 +57,8 @@ def validate(self, X, y): ------- dict (DataCheckError): List with DataCheckErrors if any invalid values are found in the target data. - Example: + Example + ------- >>> import pandas as pd >>> X = pd.DataFrame({"col": [1, 2, 3, 1]}) >>> y = pd.Series([0, 1, None, None]) diff --git a/evalml/data_checks/multicollinearity_data_check.py b/evalml/data_checks/multicollinearity_data_check.py index 282d3f1663..24c9f39d33 100644 --- a/evalml/data_checks/multicollinearity_data_check.py +++ b/evalml/data_checks/multicollinearity_data_check.py @@ -10,7 +10,7 @@ class MulticollinearityDataCheck(DataCheck): """Check if any set features are likely to be multicollinear. - Arguments: + Parameters --------- threshold (float): The threshold to be considered. Defaults to 0.9. """ @@ -23,7 +23,7 @@ def __init__(self, threshold=0.9): def validate(self, X, y=None): """Check if any set of features are likely to be multicollinear. - Arguments: + Parameters --------- X (pd.DataFrame): The input features to check. y (pd.Series): The target. Ignored. diff --git a/evalml/data_checks/natural_language_nan_data_check.py b/evalml/data_checks/natural_language_nan_data_check.py index 1603f5ced0..9f0daff27f 100644 --- a/evalml/data_checks/natural_language_nan_data_check.py +++ b/evalml/data_checks/natural_language_nan_data_check.py @@ -12,7 +12,7 @@ def validate(self, X, y=None): """ Check if any natural language columns contain NaN values. - Arguments: + Parameters --------- X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. Defaults to None. diff --git a/evalml/data_checks/no_variance_data_check.py b/evalml/data_checks/no_variance_data_check.py index c9d5ee93cf..ed7259d213 100644 --- a/evalml/data_checks/no_variance_data_check.py +++ b/evalml/data_checks/no_variance_data_check.py @@ -17,7 +17,7 @@ class NoVarianceDataCheck(DataCheck): """ Check if the target or any of the features have no variance. - Arguments: + Parameters --------- count_nan_as_value (bool): If True, missing values will be counted as their own unique value. Additionally, if true, will return a DataCheckWarning instead of an error @@ -33,7 +33,7 @@ def _check_for_errors(self, column_name, count_unique, any_nulls): """ Check if a column has no variance. - Arguments: + Parameters --------- column_name (str): Name of the column we are checking. count_unique (float): Number of unique values in this column. @@ -68,7 +68,7 @@ def validate(self, X, y): """ Check if the target or any of the features have no variance (1 unique value). - Arguments: + Parameters --------- X (pd.DataFrame, np.ndarray): The input features. y (pd.Series, np.ndarray): The target data. diff --git a/evalml/data_checks/outliers_data_check.py b/evalml/data_checks/outliers_data_check.py index 6fb78c4ba6..9a1b7a81fc 100644 --- a/evalml/data_checks/outliers_data_check.py +++ b/evalml/data_checks/outliers_data_check.py @@ -1,3 +1,4 @@ +"""Data check that checks if there are any outliers in input data by using IQR to determine score anomalies.""" import numpy as np from scipy.stats import gamma @@ -17,18 +18,19 @@ class OutliersDataCheck(DataCheck): """ def validate(self, X, y=None): - """Checks if there are any outliers in a dataframe by using IQR to determine column anomalies. Column with anomalies are considered to contain outliers. + """ + Check if there are any outliers in a dataframe by using IQR to determine column anomalies. 
Columns with anomalies are considered to contain outliers.
 
-        Arguments:
+        Parameters
         ---------
            X (pd.DataFrame, np.ndarray): Features
            y (pd.Series, np.ndarray): Ignored.
 
-        Return:
+        Return
        ------
            dict: A dictionary with warnings if any columns have outliers.
 
-        Example:
+        Example
        -------
            >>> import pandas as pd
            >>> df = pd.DataFrame({
@@ -85,19 +87,21 @@ def validate(self, X, y=None):
 
     @staticmethod
     def _no_outlier_prob(num_records: int, pct_outliers: float) -> float:
-        """This functions calculates the probability that there are no true outliers in a numeric (integer or float) column. It is based on creating 100,000 samples consisting of a given number of records, and then repeating this over a grid of sample sizes. Each value in a sample is drawn from a log normal distribution, and then the number of potential outliers in the data is determined using the skew adjusted box plot approach based on the medcouple statistic. It was observed that the distribution of the percentage of outliers could be described by a gamma distribution, with the shape and scale parameters changing with the sample size. For each sample size, the shape and scale parameters of the gamma distriubtion were estimated using maximum likelihood methods. The set of estimate shape and scale parameters for different sample size were then used to fit equations that relate these two parameters to the sample size. These equations use a transendental logrithmic functional form that provides a seventh order Taylor series approximation to the two true functional relationships, and was estimated using least squares regression.
+        """
+        Calculate the probability that there are no true outliers in a numeric (integer or float) column. It is based on creating 100,000 samples consisting of a given number of records, and then repeating this over a grid of sample sizes. Each value in a sample is drawn from a log normal distribution, and then the number of potential outliers in the data is determined using the skew adjusted box plot approach based on the medcouple statistic. It was observed that the distribution of the percentage of outliers could be described by a gamma distribution, with the shape and scale parameters changing with the sample size. For each sample size, the shape and scale parameters of the gamma distribution were estimated using maximum likelihood methods. The set of estimated shape and scale parameters for different sample sizes were then used to fit equations that relate these two parameters to the sample size. These equations use a transcendental logarithmic functional form that provides a seventh order Taylor series approximation to the two true functional relationships, and were estimated using least squares regression.
 
         Original credit goes to Jad Raad and Dan Putler of Alteryx.
 
+        Parameters
+        ---------
+            num_records (int): The integer number of non-missing values in a column.
+            pct_outliers (float): The percentage of potential outliers in a column.
 
-        Arguments:
-            num_records (int): The integer number of non-missing values in a column
-            pct_outliers (float): The percentage of potential outliers in a column
 
-        Returns:
-            float: The probability that no outliers are present in the column
+        Returns
+        -------
+            float: The probability that no outliers are present in the column.
         """
-
-        # calculate the shape and scale parameters of the approximate
+        # Calculate the shape and scale parameters of the approximate
         # gamma distribution given the number of records in the data.
# For both measures, the values are are from a least squares regression # model diff --git a/evalml/data_checks/sparsity_data_check.py b/evalml/data_checks/sparsity_data_check.py index b2f51e6555..899c29af20 100644 --- a/evalml/data_checks/sparsity_data_check.py +++ b/evalml/data_checks/sparsity_data_check.py @@ -16,7 +16,7 @@ class SparsityDataCheck(DataCheck): """ Check if there are any columns with sparsely populated values in the input. - Arguments: + Parameters --------- problem_type (str or ProblemTypes): The specific problem type to data check for. 'multiclass' or 'time series multiclass' is the only accepted problem type. @@ -43,7 +43,7 @@ def validate(self, X, y=None): """ Calculate what percentage of each column's unique values exceed the count threshold and compare that percentage to the sparsity threshold stored in the class instance. - Arguments: + Parameters --------- X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. @@ -107,7 +107,7 @@ def sparsity_score(col, count_threshold=10): """ Calculate a sparsity score for the given value counts by calculating the percentage of unique values that exceed the count_threshold. - Arguments: + Parameters --------- col (pd.Series): Feature values. count_threshold (int): The number of instances below which a value is considered sparse. diff --git a/evalml/data_checks/target_distribution_data_check.py b/evalml/data_checks/target_distribution_data_check.py index d0ab384ab3..1a564a25ef 100644 --- a/evalml/data_checks/target_distribution_data_check.py +++ b/evalml/data_checks/target_distribution_data_check.py @@ -21,7 +21,7 @@ def validate(self, X, y): """ Check if the target data has a certain distribution. - Arguments: + Parameters --------- X (pd.DataFrame, np.ndarray): Features. Ignored. y (pd.Series, np.ndarray): Target data to check for underlying distributions. diff --git a/evalml/data_checks/target_leakage_data_check.py b/evalml/data_checks/target_leakage_data_check.py index 24cac97279..29aa3b2be1 100644 --- a/evalml/data_checks/target_leakage_data_check.py +++ b/evalml/data_checks/target_leakage_data_check.py @@ -23,7 +23,7 @@ class TargetLeakageDataCheck(DataCheck): Otherwise, if `method='pearson'`, it uses Pearson correlation and only supports binary with numeric and boolean dtypes. Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. - Arguments: + Parameters --------- pct_corr_threshold (float): The correlation threshold to be considered leakage. Defaults to 0.95. method (string): The method to determine correlation. Use 'mutual' for mutual information, otherwise 'pearson' for Pearson correlation. Defaults to 'mutual'. @@ -76,7 +76,7 @@ def validate(self, X, y): If `method='mutual'`, supports all target and feature types. Otherwise, if `method='pearson'` only supports binary with numeric and boolean dtypes. Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. 
- Arguments: + Parameters --------- X (pd.DataFrame, np.ndarray): The input features to check y (pd.Series, np.ndarray): The target data diff --git a/evalml/data_checks/uniqueness_data_check.py b/evalml/data_checks/uniqueness_data_check.py index 87169ff7bd..3a65e536df 100644 --- a/evalml/data_checks/uniqueness_data_check.py +++ b/evalml/data_checks/uniqueness_data_check.py @@ -1,3 +1,4 @@ +"""Data check that checks if there are any columns in the input that are either too unique for classification problems or not unique enough for regression problems.""" from evalml.data_checks import ( DataCheck, DataCheckAction, @@ -19,9 +20,11 @@ class UniquenessDataCheck(DataCheck): - """Checks if there are any columns in the input that are either too unique for classification problems or not unique enough for regression problems. + """ + Check if there are any columns in the input that are either too unique for classification problems or not unique enough for regression problems. - Arguments: + Arguments + --------- problem_type (str or ProblemTypes): The specific problem type to data check for. e.g. 'binary', 'multiclass', 'regression, 'time series regression' threshold(float): The threshold to set as an upper bound on uniqueness for classification type problems @@ -35,17 +38,21 @@ def __init__(self, problem_type, threshold=0.50): self.threshold = threshold def validate(self, X, y=None): - """Checks if there are any columns in the input that are too unique in the case of classification problems or not unique enough in the case of regression problems. + """ + Check if there are any columns in the input that are too unique in the case of classification problems or not unique enough in the case of regression problems. - Arguments: + Parameters + ---------- X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. Defaults to None. - Returns: + Returns + ------- dict: dict with a DataCheckWarning if there are any too unique or not unique enough columns. Example: + ------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'regression_unique_enough': [float(x) for x in range(100)], @@ -125,13 +132,14 @@ def validate(self, X, y=None): @staticmethod def uniqueness_score(col): """ - This function calculates a uniqueness score for the provided field. NaN values are not considered as unique values in the calculation. + Calculate a uniqueness score for the provided field. NaN values are not considered as unique values in the calculation. Based on the Herfindahl–Hirschman Index. - Arguments: + Parameters --------- col (pd.Series): Feature values. + Returns ------- (float): Uniqueness score. diff --git a/evalml/data_checks/utils.py b/evalml/data_checks/utils.py index bebb47d5f3..9c996f07a6 100644 --- a/evalml/data_checks/utils.py +++ b/evalml/data_checks/utils.py @@ -6,7 +6,7 @@ class EmptyDataChecks(DataChecks): """ An empty collection of data checks. - Arguments: + Parameters --------- data_checks (list (DataCheck)): Ignored. diff --git a/evalml/demos/churn.py b/evalml/demos/churn.py index 0db659045f..e64f50bc18 100644 --- a/evalml/demos/churn.py +++ b/evalml/demos/churn.py @@ -6,7 +6,7 @@ def load_churn(n_rows=None, verbose=True): """Load churn dataset, which can be used for binary classification problems. 
- Arguments: + Parameters --------- n_rows (int): Number of rows from the dataset to return verbose (bool): Whether to print information about features and labels diff --git a/evalml/demos/fraud.py b/evalml/demos/fraud.py index 0571624b02..9a2cb42b31 100644 --- a/evalml/demos/fraud.py +++ b/evalml/demos/fraud.py @@ -8,7 +8,7 @@ def load_fraud(n_rows=None, verbose=True): The fraud dataset can be used for binary classification problems. - Arguments: + Parameters --------- n_rows (int): Number of rows from the dataset to return verbose (bool): Whether to print information about features and labels diff --git a/evalml/exceptions/exceptions.py b/evalml/exceptions/exceptions.py index 428cfa3f66..aac178c782 100644 --- a/evalml/exceptions/exceptions.py +++ b/evalml/exceptions/exceptions.py @@ -52,7 +52,7 @@ class EnsembleMissingPipelinesError(Exception): class PipelineScoreError(Exception): """An exception raised when a pipeline errors while scoring any objective in a list of objectives. - Arguments: + Parameters exceptions (dict): A dictionary mapping an objective name (str) to a tuple of the form (exception, traceback). All of the objectives that errored will be stored here. scored_successfully (dict): A dictionary mapping an objective name (str) to a score value. All of the objectives diff --git a/evalml/model_family/utils.py b/evalml/model_family/utils.py index c3cfe64f31..f232bdb593 100644 --- a/evalml/model_family/utils.py +++ b/evalml/model_family/utils.py @@ -4,7 +4,7 @@ def handle_model_family(model_family): """Handles model_family by either returning the ModelFamily or converting from a string. - Arguments: + Parameters model_family (str or ModelFamily): Model type that needs to be handled Returns: diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index a2dad56481..5f039ee83a 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -44,7 +44,7 @@ def confusion_matrix(y_true, y_predicted, normalize_method="true"): """Confusion matrix for binary and multiclass classification. - Arguments: + Parameters y_true (pd.Series or np.ndarray): True binary labels. y_pred (pd.Series or np.ndarray): Predictions from a binary classifier. normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. @@ -67,7 +67,7 @@ def confusion_matrix(y_true, y_predicted, normalize_method="true"): def normalize_confusion_matrix(conf_mat, normalize_method="true"): """Normalize a confusion matrix. - Arguments: + Parameters conf_mat (pd.DataFrame or np.ndarray): Confusion matrix to normalize. normalize_method ({'true', 'pred', 'all'}): Normalization method. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. @@ -106,7 +106,7 @@ def graph_confusion_matrix( If `normalize_method` is set, hover text will show raw count, otherwise hover text will show count normalized with method 'true'. - Arguments: + Parameters y_true (pd.Series or np.ndarray): True binary labels. y_pred (pd.Series or np.ndarray): Predictions from a binary classifier. normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. 
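For a concrete sense of the confusion matrix helpers documented above, a minimal usage sketch (the data and variable names are illustrative; the normalization behavior follows from the docstrings):

    import pandas as pd
    from evalml.model_understanding.graphs import confusion_matrix

    y_true = pd.Series([0, 1, 1, 0, 1])
    y_pred = pd.Series([0, 1, 0, 0, 1])
    counts = confusion_matrix(y_true, y_pred, normalize_method=None)  # raw counts
    by_row = confusion_matrix(y_true, y_pred)  # default "true": each row sums to 1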
@@ -184,7 +184,7 @@ def graph_confusion_matrix( def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): """Given labels and binary classifier predicted probabilities, compute and return the data representing a precision-recall curve. - Arguments: + Parameters y_true (pd.Series or np.ndarray): True binary labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label. pos_label_idx (int): the column index corresponding to the positive class. If predicted probabilities are two-dimensional, this will be used to access the probabilities for the positive class. @@ -222,7 +222,7 @@ def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): """Generate and display a precision-recall plot. - Arguments: + Parameters y_true (pd.Series or np.ndarray): True binary labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label. title_addition (str or None): If not None, append to plot title. Default None. @@ -261,7 +261,7 @@ def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): def roc_curve(y_true, y_pred_proba): """Given labels and classifier predicted probabilities, compute and return the data representing a Receiver Operating Characteristic (ROC) curve. Works with binary or multiclass problems. - Arguments: + Parameters y_true (pd.Series or np.ndarray): True labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. @@ -310,7 +310,7 @@ def roc_curve(y_true, y_pred_proba): def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_addition=None): """Generate and display a Receiver Operating Characteristic (ROC) plot for binary and multiclass classification problems. - Arguments: + Parameters y_true (pd.Series or np.ndarray): True labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. Note this should a one dimensional array with the predicted probability for the "true" label in the binary case. custom_class_labels (list or None): If not None, custom labels for classes. Default None. @@ -369,7 +369,7 @@ def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_additio def graph_permutation_importance(pipeline, X, y, objective, importance_threshold=0): """Generate a bar graph of the pipeline's permutation importance. - Arguments: + Parameters pipeline (PipelineBase or subclass): Fitted pipeline X (pd.DataFrame): The input data used to score and compute permutation importance y (pd.Series): The target data @@ -428,7 +428,7 @@ def graph_permutation_importance(pipeline, X, y, objective, importance_threshold def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): """Compute objective score as a function of potential binary classification decision thresholds for a fitted binary classification pipeline. 
- Arguments: + Parameters pipeline (BinaryClassificationPipeline obj): Fitted binary classification pipeline X (pd.DataFrame): The input data used to compute objective score y (pd.Series): The target labels @@ -460,7 +460,7 @@ def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): def graph_binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): """Generate a plot graphing objective score vs. decision thresholds for a fitted binary classification pipeline. - Arguments: + Parameters pipeline (PipelineBase or subclass): Fitted pipeline X (pd.DataFrame): The input data used to score and compute scores y (pd.Series): The target labels @@ -570,7 +570,7 @@ def partial_dependence( is calculated with the first feature in the y-axis and second feature in the x-axis. - Arguments: + Parameters pipeline (PipelineBase or subclass): Fitted pipeline X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at @@ -933,7 +933,7 @@ def graph_partial_dependence( ): """Create an one-way or two-way partial dependence plot. Passing a single integer or string as features will create a one-way partial dependence plot with the feature values plotted against the partial dependence. Passing features a tuple of int/strings will create a two-way partial dependence plot with a contour of feature[0] in the y-axis, feature[1] in the x-axis and the partial dependence in the z-axis. - Arguments: + Parameters pipeline (PipelineBase or subclass): Fitted pipeline. X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at. @@ -1176,7 +1176,7 @@ def _calculate_axis_range(arr): def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): """Combine y_true and y_pred into a single dataframe and adds a column for outliers. Used in `graph_prediction_vs_actual()`. - Arguments: + Parameters y_true (pd.Series, or np.ndarray): The real target values of the data y_pred (pd.Series, or np.ndarray): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference @@ -1215,7 +1215,7 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): """Generate a scatter plot comparing the true and predicted values. Used for regression plotting. - Arguments: + Parameters y_true (pd.Series): The real target values of the data y_pred (pd.Series): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference @@ -1298,7 +1298,7 @@ def recurse(i): def decision_tree_data_from_estimator(estimator): """Return data for a fitted tree in a restructured format. - Arguments: + Parameters estimator (ComponentBase): A fitted DecisionTree-based estimator. Returns: @@ -1321,7 +1321,7 @@ def decision_tree_data_from_estimator(estimator): def decision_tree_data_from_pipeline(pipeline_): """Return data for a fitted pipeline with in a restructured format. - Arguments: + Parameters pipeline_ (PipelineBase): A pipeline with a DecisionTree-based estimator. Returns: @@ -1347,7 +1347,7 @@ def visualize_decision_tree( ): """Generate an image visualizing the decision tree. 
- Arguments: + Parameters estimator (ComponentBase): A fitted DecisionTree-based estimator. max_depth (int, optional): The depth to which the tree should be displayed. If set to None (as by default), tree is fully generated. @@ -1423,7 +1423,7 @@ def visualize_decision_tree( def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): """Get the data needed for the prediction_vs_actual_over_time plot. - Arguments: + Parameters pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. @@ -1449,7 +1449,7 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): """Plot the target values and predictions against time on the x-axis. - Arguments: + Parameters pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. @@ -1499,7 +1499,7 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): def get_linear_coefficients(estimator, features=None): """Return a dataframe showing the features with the greatest predictive power for a linear model. - Arguments: + Parameters estimator (Estimator): Fitted linear model family estimator. features (list[str]): List of feature names associated with the underlying data. @@ -1535,7 +1535,7 @@ def t_sne( ): """Get the transformed output after fitting X to the embedded space using t-SNE. - Arguments: + Parameters X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning @@ -1578,7 +1578,7 @@ def graph_t_sne( ): """Plot high dimensional data into lower dimensional space using t-SNE . - Arguments: + Parameters X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py index 6f2f18fd1e..d9314a22ca 100644 --- a/evalml/model_understanding/permutation_importance.py +++ b/evalml/model_understanding/permutation_importance.py @@ -13,7 +13,7 @@ def calculate_permutation_importance( ): """Calculates permutation importance for features. - Arguments: + Parameters pipeline (PipelineBase or subclass): Fitted pipeline. X (pd.DataFrame): The input data used to score and compute permutation importance. y (pd.Series): The target data. @@ -77,7 +77,7 @@ def calculate_permutation_importance_one_column( ): """Calculates permutation importance for one column in the original dataframe. - Arguments: + Parameters pipeline (PipelineBase or subclass): Fitted pipeline. X (pd.DataFrame): The input data used to score and compute permutation importance. y (pd.Series): The target data. 
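A typical call to the permutation importance API documented above looks like the following sketch; it assumes `pipeline` has already been fitted on `X` and `y`:

    from evalml.model_understanding import calculate_permutation_importance

    # Returns a DataFrame with "feature" and "importance" columns,
    # ordered from most to least important.
    importance = calculate_permutation_importance(pipeline, X, y, objective="Log Loss Binary")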
diff --git a/evalml/model_understanding/prediction_explanations/_algorithms.py b/evalml/model_understanding/prediction_explanations/_algorithms.py index e9c80a16d6..69a3c7631d 100644 --- a/evalml/model_understanding/prediction_explanations/_algorithms.py +++ b/evalml/model_understanding/prediction_explanations/_algorithms.py @@ -15,7 +15,7 @@ def _create_dictionary(shap_values, feature_names): """Creates a mapping from a feature name to a list of SHAP values for all points that were queried. - Arguments: + Parameters shap_values (np.ndarray): SHAP values stored in an array of shape (n_datapoints, n_features). feature_names (Iterable): Iterable storing the feature names as they are ordered in the dataset. @@ -34,7 +34,7 @@ def _create_dictionary(shap_values, feature_names): def _compute_shap_values(pipeline, features, training_data=None): """Computes SHAP values for each feature. - Arguments: + Parameters pipeline (PipelineBase): Trained pipeline whose predictions we want to explain with SHAP. features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on. training_data (pd.DataFrame): Training data the pipeline was fit on. @@ -147,7 +147,7 @@ def _aggreggate_shap_values_dict(values, provenance): This aggregation will happen for all features for which we know the provenance/lineage. Other features will be left as they are. - Arguments: + Parameters values (dict): A mapping of feature names to a list of SHAP values for each data point. provenance (dict): A mapping from a feature in the original data to the names of the features that were created from that feature. @@ -179,7 +179,7 @@ def _aggreggate_shap_values_dict(values, provenance): def _aggregate_shap_values(values, provenance): """Aggregates shap values across features created from a common feature. - Arguments: + Parameters values (dict): A mapping of feature names to a list of SHAP values for each data point. provenance (dict): A mapping from a feature in the original data to the names of the features that were created from that feature @@ -201,7 +201,7 @@ def _aggregate_shap_values(values, provenance): def _normalize_values_dict(values): """Normalizes SHAP values by dividing by the sum of absolute values for each feature. - Arguments: + Parameters values (dict): A mapping of feature names to a list of SHAP values for each data point. Returns: @@ -231,7 +231,7 @@ def _normalize_values_dict(values): def _normalize_shap_values(values): """Normalizes the SHAP values by the absolute value of their sum for each data point. - Arguments: + Parameters values (dict or list(dict)): Dictionary mapping feature name to list of values, or a list of dictionaries (each mapping a feature name to a list of values). diff --git a/evalml/model_understanding/prediction_explanations/_report_creator_factory.py b/evalml/model_understanding/prediction_explanations/_report_creator_factory.py index a1ecb8106f..eb0aed8bed 100644 --- a/evalml/model_understanding/prediction_explanations/_report_creator_factory.py +++ b/evalml/model_understanding/prediction_explanations/_report_creator_factory.py @@ -27,7 +27,7 @@ def _report_creator_factory( ): """Get and initialize the report creator class given the ReportData and parameters passed in by the user. - Arguments: + Parameters data (_ReportData): Data about the problem (pipeline/predicted values, etc) needed for the report. report_type (str): Either "explain_predictions" or "explain_predictions_best_worst" output_format (str): Either "text" or "dict" - passed in by user. 
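The normalization rule behind _normalize_values_dict above, dividing each SHAP value by the sum of absolute SHAP values at that data point, can be verified by hand. A standalone sketch with made-up values, not touching evalml's private helpers:

    # feature -> one SHAP value per data point (two points here); values are invented
    values = {"age": [2.0, -1.0], "income": [-2.0, 3.0]}

    n_points = len(values["age"])
    # Sum of absolute SHAP values at each data point.
    totals = [sum(abs(vals[i]) for vals in values.values()) for i in range(n_points)]
    normalized = {
        feature: [v / t if t else 0.0 for v, t in zip(vals, totals)]
        for feature, vals in values.items()
    }
    assert normalized == {"age": [0.5, -0.25], "income": [-0.5, 0.75]}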
diff --git a/evalml/model_understanding/prediction_explanations/_user_interface.py b/evalml/model_understanding/prediction_explanations/_user_interface.py index cdf4dc6f61..119129c219 100644 --- a/evalml/model_understanding/prediction_explanations/_user_interface.py +++ b/evalml/model_understanding/prediction_explanations/_user_interface.py @@ -22,7 +22,7 @@ def _make_rows( ): """Makes the rows (one row for each feature) for the SHAP table. - Arguments: + Parameters shap_values (dict): Dictionary mapping the feature names to their SHAP values. In a multiclass setting, this dictionary corresponds to the SHAP values for a single class. normalized_values (dict): Normalized SHAP values. Same structure as shap_values parameter. @@ -128,7 +128,7 @@ def _make_text_table( ): """Make a table displaying the SHAP values for a prediction. - Arguments: + Parameters shap_values (dict): Dictionary mapping the feature names to their SHAP values. In a multiclass setting, this dictionary corresponds to the SHAP values for a single class. normalized_values (dict): Normalized SHAP values. Same structure as shap_values parameter. @@ -484,7 +484,7 @@ def _make_single_prediction_shap_table( ): """Creates table summarizing the top_k_features positive and top_k_features negative contributing features to the prediction of a single datapoint. - Arguments: + Parameters pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP. pipeline_features (pd.DataFrame): Dataframe of features computed by the pipeline. input_features (pd.DataFrame): Dataframe of features passed to the pipeline. This is where the pipeline_features @@ -595,7 +595,7 @@ def make_text(self, rank): Differences between best/worst reports and reports where the user manually specifies the input features subset are handled by formatting the value of the prefix parameter in the initialization. - Arguments: + Parameters rank (int): Rank (1, 2, 3, ...) of the prediction. Used to say "Best 1 of 5", "Worst 1 of 5", etc. """ prefix = self.prefixes[(rank // self.n_indices)] @@ -605,7 +605,7 @@ def make_text(self, rank): def make_dict(self, rank): """Makes the heading section for reports formatted as dictionaries. - Arguments: + Parameters rank (int): Rank (1, 2, 3, ...) of the prediction. Used to say "Best 1 of 5", "Worst 1 of 5", etc. """ prefix = self.prefixes[(rank // self.n_indices)] @@ -615,7 +615,7 @@ def make_dict(self, rank): def make_dataframe(self, rank): """Makes the heading section for reports formatted as a dataframe. - Arguments: + Parameters rank (int): Rank (1, 2, 3, ...) of the prediction. Used to say "Best 1 of 5", "Worst 1 of 5", etc. """ return self.make_dict(rank) @@ -634,7 +634,7 @@ def __init__(self, error_name, y_pred_values): def make_text(self, index, y_pred, y_true, scores, dataframe_index): """Makes the predicted values section for classification problem best/worst reports formatted as text. - Arguments: + Parameters index (int): The index of the prediction in the dataset. y_pred (pd.Series): Pipeline predictions on the entire dataset. y_true (pd.Series): Targets for the entire dataset. @@ -687,7 +687,7 @@ def __init__(self, error_name, y_pred_values=None): def make_text(self, index, y_pred, y_true, scores, dataframe_index): """Makes the predicted values section for regression problem best/worst reports formatted as text. - Arguments: + Parameters index (int): The index of the prediction in the dataset. y_pred (pd.Series): Pipeline predictions on the entire dataset.
y_true (pd.Series): Targets for the entire dataset. @@ -734,7 +734,7 @@ def make_text(self, index, pipeline, pipeline_features, input_features): Handling the differences in how the table is formatted between regression and classification problems is delegated to the _make_single_prediction_shap_table - Arguments: + Parameters index (int): The index of the prediction in the dataset. pipeline (PipelineBase): The pipeline to explain. pipeline_features (pd.DataFrame): The dataframe of features created by the pipeline. @@ -803,7 +803,7 @@ def __init__(self, heading_maker, predicted_values_maker, table_maker): def make_text(self, data): """Make a prediction explanation report that is formatted as text. - Arguments: + Parameters data (_ReportData): Data passed in by the user. Returns: @@ -834,7 +834,7 @@ def make_text(self, data): def make_dict(self, data): """Make a prediction explanation report that is formatted as a dictionary. - Arguments: + Parameters data (_ReportData): Data passed in by the user. Returns: diff --git a/evalml/model_understanding/prediction_explanations/explainers.py b/evalml/model_understanding/prediction_explanations/explainers.py index 64a2347359..be6277c377 100644 --- a/evalml/model_understanding/prediction_explanations/explainers.py +++ b/evalml/model_understanding/prediction_explanations/explainers.py @@ -47,7 +47,7 @@ def explain_predictions( XGBoost and Stacked Ensemble models, as well as CatBoost multiclass classifiers, are not currently supported. - Arguments: + Parameters pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP. input_features (pd.DataFrame): Dataframe of input data to evaluate the pipeline on. y (pd.Series): Labels for the input data. @@ -141,7 +141,7 @@ def explain_predictions_best_worst( XGBoost and Stacked Ensemble models, as well as CatBoost multiclass classifiers, are not currently supported. - Arguments: + Parameters pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP. input_features (pd.DataFrame): Input data to evaluate the pipeline on. y_true (pd.Series): True labels for the input data. @@ -268,7 +268,7 @@ def explain_predictions_best_worst( def abs_error(y_true, y_pred): """Computes the absolute error per data point for regression problems. - Arguments: + Parameters y_true (pd.Series): True labels. y_pred (pd.Series): Predicted values. @@ -281,7 +281,7 @@ def abs_error(y_true, y_pred): def cross_entropy(y_true, y_pred_proba): """Computes Cross Entropy Loss per data point for classification problems. - Arguments: + Parameters y_true (pd.Series): True labels encoded as ints. y_pred_proba (pd.DataFrame): Predicted probabilities. One column per class. diff --git a/evalml/objectives/binary_classification_objective.py b/evalml/objectives/binary_classification_objective.py index d067ef496d..a28bcc78fa 100644 --- a/evalml/objectives/binary_classification_objective.py +++ b/evalml/objectives/binary_classification_objective.py @@ -25,7 +25,7 @@ def can_optimize_threshold(cls): def optimize_threshold(self, ypred_proba, y_true, X=None): """Learn a binary classification threshold which optimizes the current objective. - Arguments: + Parameters ypred_proba (pd.Series): The classifier's predicted probabilities y_true (pd.Series): The ground truth for the predictions. X (pd.DataFrame, optional): Any extra columns that are needed from training data. 
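The threshold-tuning hunk above reduces to a one-dimensional search: sweep candidate thresholds, score the resulting class predictions, and keep the best. A sketch assuming a higher-is-better `score` callable (a stand-in; the evalml optimizer can also use numeric solvers and respects each objective's direction):

    import numpy as np

    def optimize_threshold_sketch(ypred_proba, y_true, score, steps=100):
        thresholds = np.linspace(0.0, 1.0, steps + 1)
        # score the hard predictions produced by each candidate threshold
        scores = [score(y_true, (ypred_proba > t).astype(int)) for t in thresholds]
        return thresholds[int(np.argmax(scores))]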
@@ -57,7 +57,7 @@ def cost(threshold): def decision_function(self, ypred_proba, threshold=0.5, X=None): """Apply a learned threshold to predicted probabilities to get predicted classes. - Arguments: + Parameters ypred_proba (pd.Series, np.ndarray): The classifier's predicted probabilities threshold (float, optional): Threshold used to make a prediction. Defaults to 0.5. X (pd.DataFrame, optional): Any extra columns that are needed from training data. diff --git a/evalml/objectives/cost_benefit_matrix.py b/evalml/objectives/cost_benefit_matrix.py index ff45ff6a5a..caba301868 100644 --- a/evalml/objectives/cost_benefit_matrix.py +++ b/evalml/objectives/cost_benefit_matrix.py @@ -8,7 +8,7 @@ class CostBenefitMatrix(BinaryClassificationObjective): """Score using a cost-benefit matrix. Scores quantify the benefits of a given value, so greater numeric scores represent a better score. Costs and scores can be negative, indicating that a value is not beneficial. For example, in the case of monetary profit, a negative cost and/or score represents loss of cash flow. - Arguments: + Parameters true_positive (float): Cost associated with true positive predictions true_negative (float): Cost associated with true negative predictions false_positive (float): Cost associated with false positive predictions @@ -36,7 +36,7 @@ def __init__(self, true_positive, true_negative, false_positive, false_negative) def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): """Calculates cost-benefit using the predicted and true values. - Arguments: + Parameters y_predicted (pd.Series): Predicted labels y_true (pd.Series): True labels X (pd.DataFrame): Ignored. diff --git a/evalml/objectives/fraud_cost.py b/evalml/objectives/fraud_cost.py index 5c0b181e63..20905be496 100644 --- a/evalml/objectives/fraud_cost.py +++ b/evalml/objectives/fraud_cost.py @@ -4,7 +4,7 @@ class FraudCost(BinaryClassificationObjective): """Score the percentage of money lost of the total transaction amount processed due to fraud. - Arguments: + Parameters retry_percentage (float): What percentage of customers will retry a transaction if it is declined. Between 0 and 1. Defaults to 0.5. interchange_fee (float): How much of each successful transaction you pay. @@ -36,7 +36,7 @@ def __init__( def objective_function(self, y_true, y_predicted, X, sample_weight=None): """Calculate amount lost to fraud per transaction given predictions, true values, and dataframe with transaction amount. - Arguments: + Parameters y_predicted (pd.Series): Predicted fraud labels y_true (pd.Series): True fraud labels X (pd.DataFrame): Data with transaction amounts diff --git a/evalml/objectives/lead_scoring.py b/evalml/objectives/lead_scoring.py index 07f2d7e8ed..b9f6145cbd 100644 --- a/evalml/objectives/lead_scoring.py +++ b/evalml/objectives/lead_scoring.py @@ -6,7 +6,7 @@ class LeadScoring(BinaryClassificationObjective): """Lead scoring. - Arguments: + Parameters true_positives (int): Reward for a true positive. Defaults to 1. false_positives (int): Cost for a false positive. Should be negative. Defaults to -1. """ @@ -25,7 +25,7 @@ def __init__(self, true_positives=1, false_positives=-1): def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): """Calculate the profit per lead. - Arguments: + Parameters y_predicted (pd.Series): Predicted labels y_true (pd.Series): True labels X (pd.DataFrame): Ignored.
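The objectives above all follow the same pattern: weight each confusion-matrix cell by a payoff and reduce to one number. A back-of-the-envelope sketch with made-up payoffs (not evalml defaults), averaging per datapoint; the real objectives define their own scaling:

    import numpy as np

    def cost_benefit_sketch(y_true, y_pred, tp=100.0, tn=0.0, fp=-10.0, fn=-50.0):
        y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
        payoff = (
            tp * np.sum((y_true == 1) & (y_pred == 1))
            + tn * np.sum((y_true == 0) & (y_pred == 0))
            + fp * np.sum((y_true == 0) & (y_pred == 1))
            + fn * np.sum((y_true == 1) & (y_pred == 0))
        )
        return payoff / len(y_true)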
diff --git a/evalml/objectives/objective_base.py b/evalml/objectives/objective_base.py index 6aca76d283..414c26dc47 100644 --- a/evalml/objectives/objective_base.py +++ b/evalml/objectives/objective_base.py @@ -63,7 +63,7 @@ def expected_range(cls): def objective_function(cls, y_true, y_predicted, X=None, sample_weight=None): """Computes the relative value of the provided predictions compared to the actual labels, according to a specified metric. - Arguments: + Parameters y_predicted (pd.Series): Predicted values of length [n_samples] y_true (pd.Series): Actual class labels of length [n_samples] X (pd.DataFrame or np.ndarray): Extra data of shape [n_samples, n_features] necessary to calculate score @@ -84,7 +84,7 @@ def positive_only(cls): def score(self, y_true, y_predicted, X=None, sample_weight=None): """Returns a numerical score indicating performance based on the differences between the predicted and actual values. - Arguments: + Parameters y_predicted (pd.Series): Predicted values of length [n_samples] y_true (pd.Series): Actual class labels of length [n_samples] X (pd.DataFrame or np.ndarray): Extra data of shape [n_samples, n_features] necessary to calculate score @@ -106,7 +106,7 @@ def score(self, y_true, y_predicted, X=None, sample_weight=None): def _standardize_input_type(input_data): """Standardize input to pandas for scoring. - Arguments: + Parameters input_data (list, pd.DataFrame, pd.Series, or np.ndarray): A matrix of predictions or predicted probabilities Returns: @@ -126,7 +126,7 @@ def _standardize_input_type(input_data): def validate_inputs(self, y_true, y_predicted): """Validates the input based on a few simple checks. - Arguments: + Parameters y_predicted (pd.Series, or pd.DataFrame): Predicted values of length [n_samples] y_true (pd.Series): Actual class labels of length [n_samples] @@ -156,7 +156,7 @@ def validate_inputs(self, y_true, y_predicted): def calculate_percent_difference(cls, score, baseline_score): """Calculate the percent difference between scores. - Arguments: + Parameters score (float): A score. Output of the score method of this objective. baseline_score (float): A score. Output of the score method of this objective. In practice, this is the score achieved on this objective with a baseline estimator. diff --git a/evalml/objectives/sensitivity_low_alert.py b/evalml/objectives/sensitivity_low_alert.py index b75d0c61d5..3261e28c14 100644 --- a/evalml/objectives/sensitivity_low_alert.py +++ b/evalml/objectives/sensitivity_low_alert.py @@ -18,7 +18,7 @@ class SensitivityLowAlert(BinaryClassificationObjective): def __init__(self, alert_rate=0.01): """Create instance of SensitivityLowAlert. - Arguments: + Parameters alert_rate (float): Percentage of top scores to classify as high risk """ if (alert_rate > 1) or (alert_rate < 0): @@ -29,7 +29,7 @@ def __init__(self, alert_rate=0.01): def decision_function(self, ypred_proba, **kwargs): """Determine if an observation is high risk given an alert rate. - Arguments: + Parameters ypred_proba (pd.Series): Predicted probabilities @@ -48,7 +48,7 @@ def decision_function(self, ypred_proba, **kwargs): def objective_function(self, y_true, y_predicted, **kwargs): """Calculate sensitivity across all predictions, using the top alert_rate percent of observations as the predicted positive class.
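The SensitivityLowAlert decision rule documented in the hunk above (flag the top alert_rate fraction of predicted probabilities as high risk) fits in two lines; tie handling at the quantile cutoff is simplified relative to the real component:

    import pandas as pd

    def high_risk_sketch(ypred_proba, alert_rate=0.01):
        cutoff = ypred_proba.quantile(1 - alert_rate)  # e.g. the 99th percentile for a 1% alert rate
        return ypred_proba > cutoff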
- Arguments: + Parameters y_true (pd.Series): True labels y_predicted (pd.Series): Predicted labels based on alert_rate diff --git a/evalml/objectives/utils.py b/evalml/objectives/utils.py index 16327f532d..595dfea212 100644 --- a/evalml/objectives/utils.py +++ b/evalml/objectives/utils.py @@ -68,7 +68,7 @@ def get_core_objective_names(): def get_objective(objective, return_instance=False, **kwargs): """Returns the Objective class corresponding to a given objective name. - Arguments: + Parameters objective (str or ObjectiveBase): Name or instance of the objective class. return_instance (bool): Whether to return an instance of the objective. This only applies if objective is of type str. Note that the instance will be initialized with default arguments. @@ -113,7 +113,7 @@ def get_core_objectives(problem_type): Core objectives are designed to work out-of-the-box for any dataset. - Arguments: + Parameters problem_type (str/ProblemTypes): Type of problem Returns: diff --git a/evalml/pipelines/binary_classification_pipeline.py b/evalml/pipelines/binary_classification_pipeline.py index c169b502d4..55467da06e 100644 --- a/evalml/pipelines/binary_classification_pipeline.py +++ b/evalml/pipelines/binary_classification_pipeline.py @@ -13,7 +13,7 @@ class BinaryClassificationPipeline( ): """Pipeline subclass for all binary classification pipelines. - Arguments: + Parameters component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph @@ -31,7 +31,7 @@ class BinaryClassificationPipeline( def _predict(self, X, objective=None): """Make predictions using selected features. - Arguments: + Parameters X (pd.DataFrame): Data of shape [n_samples, n_features] objective (Object or string): The objective to use to make predictions @@ -55,7 +55,7 @@ def _predict(self, X, objective=None): def predict_proba(self, X): """Make probability estimates for labels. Assumes that the column at index 1 represents the positive label case. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features] Returns: diff --git a/evalml/pipelines/binary_classification_pipeline_mixin.py b/evalml/pipelines/binary_classification_pipeline_mixin.py index 0438ac425d..10382cd220 100644 --- a/evalml/pipelines/binary_classification_pipeline_mixin.py +++ b/evalml/pipelines/binary_classification_pipeline_mixin.py @@ -42,7 +42,7 @@ def _select_y_pred_for_score(self, X, y, y_pred, y_pred_proba, objective): def optimize_threshold(self, X, y, y_pred_proba, objective): """Optimize the pipeline threshold given the objective to use. Only used for binary problems with objectives whose thresholds can be tuned. - Arguments: + Parameters X (pd.DataFrame): Input features y (pd.Series): Input target values y_pred_proba (pd.Series): The predicted probabilities of the target outputted by the pipeline diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py index 1988169589..4e8749745b 100644 --- a/evalml/pipelines/classification_pipeline.py +++ b/evalml/pipelines/classification_pipeline.py @@ -8,7 +8,7 @@ class ClassificationPipeline(PipelineBase): """Pipeline subclass for all classification pipelines. - Arguments: + Parameters component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. 
Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph @@ -38,7 +38,7 @@ def __init__( def fit(self, X, y): """Build a classification model. For string and categorical targets, classes are sorted by sorted(set(y)) and then are mapped to values between 0 and n_classes-1. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, np.ndarray): The target training labels of length [n_samples] @@ -80,7 +80,7 @@ def classes_(self): def _predict(self, X, objective=None): """Make predictions using selected features. - Arguments: + Parameters X (pd.DataFrame): Data of shape [n_samples, n_features] objective (Object or string): The objective to use to make predictions @@ -92,7 +92,7 @@ def _predict(self, X, objective=None): def predict(self, X, objective=None): """Make predictions using selected features. - Arguments: + Parameters X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features] objective (Object or string): The objective to use to make predictions @@ -108,7 +108,7 @@ def predict(self, X, objective=None): def predict_proba(self, X): """Make probability estimates for labels. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features] Returns: @@ -131,7 +131,7 @@ def predict_proba(self, X): def score(self, X, y, objectives): """Evaluate model performance on objectives. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features] y (pd.Series, or np.ndarray): True labels of length [n_samples] objectives (list): List of objectives to score diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 024c5a2eb2..0e4f1543b1 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -28,7 +28,7 @@ class ComponentGraph: """Component graph for a pipeline as a directed acyclic graph (DAG). - Arguments: + Parameters component_dict (dict): A dictionary which specifies the components and edges between components that should be used to create the component graph. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -138,7 +138,7 @@ def default_parameters(self): def instantiate(self, parameters): """Instantiates all uninstantiated components within the graph using the given parameters. An error will be raised if a component is already instantiated but the parameters dict contains arguments for that component. - Arguments: + Parameters parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. An empty dictionary {} or None implies using all default values for component parameters. If a component in the component graph is already instantiated, it will not use any of its parameters defined in this dictionary. @@ -176,7 +176,7 @@ def instantiate(self, parameters): def fit(self, X, y): """Fit each component in the graph. - Arguments: + Parameters X (pd.DataFrame): The input training data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. """ @@ -189,7 +189,7 @@ def fit(self, X, y): def fit_features(self, X, y): """Fit all components save the final one, usually an estimator. - Arguments: + Parameters X (pd.DataFrame): The input training data of shape [n_samples, n_features]. 
y (pd.Series): The target training data of length [n_samples]. @@ -201,7 +201,7 @@ def fit_features(self, X, y): def compute_final_component_features(self, X, y=None): """Transforms all components save the final one, and gathers the data from any number of parents to get all the information that should be fed to the final component. - Arguments: + Parameters X (pd.DataFrame): Data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. Defaults to None. @@ -213,7 +213,7 @@ def compute_final_component_features(self, X, y=None): def _fit_transform_features_helper(self, needs_fitting, X, y=None): """Transforms all components save the final one, and returns the data that should be fed to the final component, usually an estimator. - Arguments: + Parameters needs_fitting (boolean): Determines if components should be fit. X (pd.DataFrame): Data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. Defaults to None. @@ -260,7 +260,7 @@ def _consolidate_inputs_for_component( def transform(self, X, y=None): """Transform the input using the component graph. - Arguments: + Parameters X (pd.DataFrame): Input features of shape [n_samples, n_features]. y (pd.Series): The target data of length [n_samples]. Defaults to None. @@ -286,7 +286,7 @@ def transform(self, X, y=None): def predict(self, X): """Make predictions using selected features. - Arguments: + Parameters X (pd.DataFrame): Input features of shape [n_samples, n_features]. Returns: @@ -306,7 +306,7 @@ def predict(self, X): def _compute_features(self, component_list, X, y=None, fit=False): """Transforms the data by applying the given components. - Arguments: + Parameters component_list (list): The list of component names to compute. X (pd.DataFrame): Input data to the pipeline to transform. y (pd.Series): The target training data of length [n_samples]. @@ -387,7 +387,7 @@ def _get_feature_provenance(self, input_feature_names): If a feature is then calculated from feature 'a', e.g. 'a_squared', then the provenance would instead be {'cats': ['a', 'a_squared', 'b']}. - Arguments: + Parameters input_feature_names (list(str)): Names of the features in the input dataframe. Returns: @@ -442,7 +442,7 @@ def _get_feature_provenance(self, input_feature_names): def get_component(self, component_name): """Retrieves a single component object from the graph. - Arguments: + Parameters component_name (str): Name of the component to retrieve. Returns: @@ -483,7 +483,7 @@ def get_estimators(self): def get_inputs(self, component_name): """Retrieves all inputs for a given component. - Arguments: + Parameters component_name (str): Name of the component to look up. Returns: @@ -500,7 +500,7 @@ def get_inputs(self, component_name): def describe(self, return_dict=False): """Outputs component graph details including component parameters. - Arguments: + Parameters return_dict (bool): If True, return dictionary of information about component graph. Defaults to False. Returns: @@ -523,7 +523,7 @@ def describe(self, return_dict=False): def graph(self, name=None, graph_format=None): """Generate an image representing the component graph. - Arguments: + Parameters name (str): Name of the graph. Defaults to None. graph_format (str): File format to save the graph in. Defaults to None. @@ -682,7 +682,7 @@ def inverse_transform(self, y): Components that implement inverse_transform are PolynomialDetrender, LabelEncoder (tbd).
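ComponentGraph's fit and transform methods walk the DAG so that every parent's output exists before its children run. A toy illustration of that ordering using the standard-library topological sort (Python 3.9+); the component names and edges here are illustrative only, not evalml's component_dict syntax:

    from graphlib import TopologicalSorter

    # each key lists the components whose output it consumes
    edges = {
        "One Hot Encoder": {"Imputer"},
        "Random Forest Classifier": {"One Hot Encoder"},
    }
    for component_name in TopologicalSorter(edges).static_order():
        print("fit:", component_name)  # Imputer -> One Hot Encoder -> Random Forest Classifier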
- Arguments: + Parameters y (pd.Series): Final component features """ data_to_transform = infer_feature_types(y) diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 543d7299fb..bd346428af 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -19,7 +19,7 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta): """Base class for all components. - Arguments: + Parameters parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -30,7 +30,7 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta): def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): """Base class for all components. - Arguments: + Parameters parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -118,7 +118,7 @@ def clone(self): def fit(self, X, y=None): """Fits component to data. - Arguments: + Parameters X (list, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (list, pd.Series, np.ndarray, optional): The target training data of length [n_samples] @@ -139,7 +139,7 @@ def fit(self, X, y=None): def describe(self, print_name=False, return_dict=False): """Describe a component and its parameters. - Arguments: + Parameters print_name(bool, optional): Whether to print name of component return_dict(bool, optional): Whether to return description as dictionary in the format {"name": name, "parameters": parameters} @@ -162,7 +162,7 @@ def describe(self, print_name=False, return_dict=False): def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): """Saves component at file path. - Arguments: + Parameters file_path (str): Location to save file pickle_protocol (int): The pickle data stream format. @@ -176,7 +176,7 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): def load(file_path): """Loads component at file path. - Arguments: + Parameters file_path (str): Location to load file Returns: diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py index 3ded6e7890..e780b7aaf8 100644 --- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py +++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py @@ -10,7 +10,7 @@ class SklearnStackedEnsembleBase(Estimator): """Stacked Ensemble Base Class. - Arguments: + Parameters input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators. This must not be None or an empty list or else EnsembleMissingPipelinesError will be raised. final_estimator (Estimator or subclass): The estimator used to combine the base estimators.
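The save/load pair documented above is a cloudpickle round trip. A sketch, with `component` standing in for any evalml component; the real methods live on ComponentBase:

    import cloudpickle

    def save_sketch(component, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL):
        with open(file_path, "wb") as f:
            cloudpickle.dump(component, f, protocol=pickle_protocol)

    def load_sketch(file_path):
        with open(file_path, "rb") as f:
            return cloudpickle.load(f)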
diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py index b1e2300870..82a6c02d03 100644 --- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py +++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py @@ -10,7 +10,7 @@ class SklearnStackedEnsembleClassifier(SklearnStackedEnsembleBase): """Scikit-learn Stacked Ensemble Classifier. - Arguments: + Parameters input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators. This must not be None or an empty list or else EnsembleMissingPipelinesError will be raised. final_estimator (Estimator or subclass): The classifier used to combine the base estimators. If None, uses LogisticRegressionClassifier. diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py index 026fd2cfe3..772a486daf 100644 --- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py +++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py @@ -10,7 +10,7 @@ class SklearnStackedEnsembleRegressor(SklearnStackedEnsembleBase): """Scikit-learn Stacked Ensemble Regressor. - Arguments: + Parameters input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators. This must not be None or an empty list or else EnsembleMissingPipelinesError will be raised. final_estimator (Estimator or subclass): The regressor used to combine the base estimators. If None, uses LinearRegressor. diff --git a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py index 909744c722..732f1037f8 100644 --- a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py @@ -12,7 +12,7 @@ class BaselineClassifier(Estimator): This is useful as a simple baseline classifier to compare with other classifiers. - Arguments: + Parameters strategy (str): Method used to predict. Valid options are "mode", "random" and "random_weighted". Defaults to "mode". random_seed (int): Seed for the random number generator. Defaults to 0. """ diff --git a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py index 99520f618d..6c30212e10 100644 --- a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py @@ -17,7 +17,7 @@ class CatBoostClassifier(Estimator): For more information, check out https://catboost.ai/ - Arguments: + Parameters n_estimators (float): The maximum number of trees to build. Defaults to 10. eta (float): The learning rate. Defaults to 0.03. max_depth (int): The maximum tree depth for base learners. Defaults to 6. 
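The "mode" strategy documented for BaselineClassifier amounts to always predicting the most frequent training label, e.g.:

    import pandas as pd

    y_train = pd.Series(["yes", "no", "yes", "yes"])
    mode_prediction = y_train.mode()[0]                 # "yes"
    baseline_preds = pd.Series([mode_prediction] * 3)   # the prediction for any 3 rows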
diff --git a/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py b/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py index e6db5d16fd..9b6821561b 100644 --- a/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py @@ -9,7 +9,7 @@ class DecisionTreeClassifier(Estimator): """Decision Tree Classifier. - Arguments: + Parameters criterion ({"gini", "entropy"}): The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. Defaults to "gini". diff --git a/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py b/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py index 58354738d5..77ef28c904 100644 --- a/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py @@ -12,7 +12,7 @@ class ElasticNetClassifier(Estimator): """Elastic Net Classifier. Uses Logistic Regression with elasticnet penalty as the base estimator. - Arguments: + Parameters penalty ({"l1", "l2", "elasticnet", "none"}): The norm used in penalization. Defaults to "elasticnet". C (float): Inverse of regularization strength. Must be a positive float. Defaults to 1.0. l1_ratio (float): The mixing parameter, with 0 <= l1_ratio <= 1. Only used if penalty='elasticnet'. Setting l1_ratio=0 is equivalent to using penalty='l2', diff --git a/evalml/pipelines/components/estimators/classifiers/et_classifier.py b/evalml/pipelines/components/estimators/classifiers/et_classifier.py index 4d550332e2..abed596522 100644 --- a/evalml/pipelines/components/estimators/classifiers/et_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/et_classifier.py @@ -9,7 +9,7 @@ class ExtraTreesClassifier(Estimator): """Extra Trees Classifier. - Arguments: + Parameters n_estimators (float): The number of trees in the forest. Defaults to 100. max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split: diff --git a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py index f74b395aa4..dec76cd267 100644 --- a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py @@ -10,7 +10,7 @@ class KNeighborsClassifier(Estimator): """K-Nearest Neighbors Classifier. - Arguments: + Parameters n_neighbors (int): Number of neighbors to use by default. Defaults to 5. weights ({‘uniform’, ‘distance’} or callable): Weight function used in prediction. Can be: diff --git a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py index fde0c6ee75..a000ae3c7c 100644 --- a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py @@ -20,7 +20,7 @@ class LightGBMClassifier(Estimator): """LightGBM Classifier. - Arguments: + Parameters boosting_type (string): Type of boosting to use. Defaults to "gbdt". 
- 'gbdt' uses traditional Gradient Boosting Decision Tree - "dart", uses Dropouts meet Multiple Additive Regression Trees diff --git a/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py b/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py index fc02f27a8d..5bbbd603f8 100644 --- a/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py @@ -10,7 +10,7 @@ class LogisticRegressionClassifier(Estimator): """Logistic Regression Classifier. - Arguments: + Parameters penalty ({"l1", "l2", "elasticnet", "none"}): The norm used in penalization. Defaults to "l2". C (float): Inverse of regularization strength. Must be a positive float. Defaults to 1.0. multi_class ({"auto", "ovr", "multinomial"}): If the option chosen is "ovr", then a binary problem is fit for each label. diff --git a/evalml/pipelines/components/estimators/classifiers/rf_classifier.py b/evalml/pipelines/components/estimators/classifiers/rf_classifier.py index 7f924c9540..c1667a8c2f 100644 --- a/evalml/pipelines/components/estimators/classifiers/rf_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/rf_classifier.py @@ -9,7 +9,7 @@ class RandomForestClassifier(Estimator): """Random Forest Classifier. - Arguments: + Parameters n_estimators (float): The number of trees in the forest. Defaults to 100. max_depth (int): Maximum tree depth for base learners. Defaults to 6. n_jobs (int or None): Number of jobs to run in parallel. -1 uses all processes. Defaults to -1. diff --git a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py index b5d5c9554b..797fbb83a6 100644 --- a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py @@ -10,7 +10,7 @@ class SVMClassifier(Estimator): """Support Vector Machine Classifier. - Arguments: + Parameters C (float): The regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. Defaults to 1.0. kernel ({"poly", "rbf", "sigmoid"}): Specifies the kernel type to be used in the algorithm. Defaults to "rbf". diff --git a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py index f829290d5b..e30550260f 100644 --- a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py @@ -12,7 +12,7 @@ class XGBoostClassifier(Estimator): """XGBoost Classifier. - Arguments: + Parameters eta (float): Boosting learning rate. Defaults to 0.1. max_depth (int): Maximum tree depth for base learners. Defaults to 6. min_child_weight (float): Minimum sum of instance weight (hessian) needed in a child. Defaults to 1.0 diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py index 0c2ac012cc..692eb59f36 100644 --- a/evalml/pipelines/components/estimators/estimator.py +++ b/evalml/pipelines/components/estimators/estimator.py @@ -19,7 +19,7 @@ class Estimator(ComponentBase): To see some examples, check out the definitions of any Estimator component. 
- Arguments: + Parameters parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -65,7 +65,7 @@ def fit(self, X, y=None): def predict(self, X): """Make predictions using selected features. - Arguments: + Parameters X (pd.DataFrame, np.ndarray): Data of shape [n_samples, n_features] Returns: @@ -85,7 +85,7 @@ def predict(self, X): def predict_proba(self, X): """Make probability estimates for labels. - Arguments: + Parameters X (pd.DataFrame, or np.ndarray): Features Returns: diff --git a/evalml/pipelines/components/estimators/regressors/arima_regressor.py b/evalml/pipelines/components/estimators/regressors/arima_regressor.py index 6bf2320399..48d4619539 100644 --- a/evalml/pipelines/components/estimators/regressors/arima_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/arima_regressor.py @@ -13,7 +13,7 @@ class ARIMARegressor(Estimator): Currently ARIMARegressor isn't supported via conda install. It's recommended that it be installed via PyPI. - Arguments: + Parameters date_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None. trend (str): Controls the deterministic trend. Options are ['n', 'c', 't', 'ct'] where 'c' is a constant term, 't' indicates a linear trend, and 'ct' is both. Can also be an iterable when defining a polynomial, such diff --git a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py index 8b1d8d8ca3..400d6d2841 100644 --- a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py @@ -10,7 +10,7 @@ class BaselineRegressor(Estimator): """Baseline regressor that uses a simple strategy to make predictions. This is useful as a simple baseline regressor to compare with other regressors. - Arguments: + Parameters strategy (str): Method used to predict. Valid options are "mean", "median". Defaults to "mean". random_seed (int): Seed for the random number generator. Defaults to 0. """ diff --git a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py index 2663efc0e2..79a76a5cf1 100644 --- a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py @@ -14,7 +14,7 @@ class CatBoostRegressor(Estimator): For more information, check out https://catboost.ai/ - Arguments: + Parameters n_estimators (float): The maximum number of trees to build. Defaults to 10. eta (float): The learning rate. Defaults to 0.03. max_depth (int): The maximum tree depth for base learners. Defaults to 6. diff --git a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py index 0eae370573..9b4d95e47d 100644 --- a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py @@ -9,7 +9,7 @@ class DecisionTreeRegressor(Estimator): """Decision Tree Regressor. - Arguments: + Parameters criterion ({"mse", "friedman_mse", "mae", "poisson"}): The function to measure the quality of a split. 
Supported criteria are: diff --git a/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py b/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py index bdd8baf08a..f54c772cbf 100644 --- a/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py @@ -9,7 +9,7 @@ class ElasticNetRegressor(Estimator): """Elastic Net Regressor. - Arguments: + Parameters alpha (float): Constant that multiplies the penalty terms. Defaults to 0.0001. l1_ratio (float): The mixing parameter, with 0 <= l1_ratio <= 1. Only used if penalty='elasticnet'. Setting l1_ratio=0 is equivalent to using penalty='l2', while setting l1_ratio=1 is equivalent to using penalty='l1'. For 0 < l1_ratio <1, the penalty is a combination of L1 and L2. Defaults to 0.15. diff --git a/evalml/pipelines/components/estimators/regressors/et_regressor.py b/evalml/pipelines/components/estimators/regressors/et_regressor.py index 3b9c947f69..fa28354aec 100644 --- a/evalml/pipelines/components/estimators/regressors/et_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/et_regressor.py @@ -9,7 +9,7 @@ class ExtraTreesRegressor(Estimator): """Extra Trees Regressor. - Arguments: + Parameters n_estimators (float): The number of trees in the forest. Defaults to 100. max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split: diff --git a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py index ac255ddead..37bfe0163d 100644 --- a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py @@ -18,7 +18,7 @@ class LightGBMRegressor(Estimator): """LightGBM Regressor. - Arguments: + Parameters boosting_type (string): Type of boosting to use. Defaults to "gbdt". - 'gbdt' uses traditional Gradient Boosting Decision Tree - "dart", uses Dropouts meet Multiple Additive Regression Trees diff --git a/evalml/pipelines/components/estimators/regressors/linear_regressor.py b/evalml/pipelines/components/estimators/regressors/linear_regressor.py index eafcb69635..f8f9c5fe82 100644 --- a/evalml/pipelines/components/estimators/regressors/linear_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/linear_regressor.py @@ -8,7 +8,7 @@ class LinearRegressor(Estimator): """Linear Regressor. - Arguments: + Parameters fit_intercept (boolean): Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). Defaults to True. diff --git a/evalml/pipelines/components/estimators/regressors/rf_regressor.py b/evalml/pipelines/components/estimators/regressors/rf_regressor.py index c24016ae0b..14e664d544 100644 --- a/evalml/pipelines/components/estimators/regressors/rf_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/rf_regressor.py @@ -9,7 +9,7 @@ class RandomForestRegressor(Estimator): """Random Forest Regressor. - Arguments: + Parameters n_estimators (float): The number of trees in the forest. Defaults to 100. max_depth (int): Maximum tree depth for base learners. Defaults to 6. n_jobs (int or None): Number of jobs to run in parallel. -1 uses all processes. Defaults to -1. 
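For the l1_ratio mixing parameter documented in the ElasticNetRegressor hunk above: 0 behaves like a pure L2 penalty, 1 like pure L1, and values in between blend the two. A scikit-learn sketch of three settings (illustrative only, not how evalml wires its component):

    from sklearn.linear_model import ElasticNet

    ridge_like = ElasticNet(alpha=0.0001, l1_ratio=0.0)   # behaves like L2
    mixed      = ElasticNet(alpha=0.0001, l1_ratio=0.15)  # the documented default mix
    lasso_like = ElasticNet(alpha=0.0001, l1_ratio=1.0)   # behaves like L1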
diff --git a/evalml/pipelines/components/estimators/regressors/svm_regressor.py b/evalml/pipelines/components/estimators/regressors/svm_regressor.py index 7bb3f5e07a..49cb51ca03 100644 --- a/evalml/pipelines/components/estimators/regressors/svm_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/svm_regressor.py @@ -10,7 +10,7 @@ class SVMRegressor(Estimator): """Support Vector Machine Regressor. - Arguments: + Parameters C (float): The regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. Defaults to 1.0. kernel ({"poly", "rbf", "sigmoid"}): Specifies the kernel type to be used in the algorithm. Defaults to "rbf". diff --git a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py index a4c0f4fd70..b9903fc729 100644 --- a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py +++ b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py @@ -12,7 +12,7 @@ class TimeSeriesBaselineEstimator(Estimator): This is useful as a simple baseline estimator for time series problems. - Arguments: + Parameters gap (int): Gap between prediction date and target date and must be a positive integer. If gap is 0, target date will be shifted ahead by 1 time period. Defaults to 1. random_seed (int): Seed for the random number generator. Defaults to 0. """ diff --git a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py index 43ef200cf8..51c70a20f6 100644 --- a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py @@ -12,7 +12,7 @@ class XGBoostRegressor(Estimator): """XGBoost Regressor. - Arguments: + Parameters eta (float): Boosting learning rate. Defaults to 0.1. max_depth (int): Maximum tree depth for base learners. Defaults to 6. min_child_weight (float): Minimum sum of instance weight (hessian) needed in a child. Defaults to 1.0 diff --git a/evalml/pipelines/components/transformers/column_selectors.py b/evalml/pipelines/components/transformers/column_selectors.py index d60cfd6b94..d457d71dff 100644 --- a/evalml/pipelines/components/transformers/column_selectors.py +++ b/evalml/pipelines/components/transformers/column_selectors.py @@ -7,7 +7,7 @@ class ColumnSelector(Transformer): """Initializes a transformer that drops specified columns in input data. - Arguments: + Parameters columns (list(string)): List of column names, used to determine which columns to select. random_seed (int): Seed for the random number generator. Defaults to 0. """ @@ -39,7 +39,7 @@ def _modify_columns(self, cols, X, y=None): def fit(self, X, y=None): """Fits the transformer by checking if column names are present in the dataset. - Arguments: + Parameters X (pd.DataFrame): Data to check. y (pd.Series, optional): Targets. @@ -61,7 +61,7 @@ def transform(self, X, y=None): class DropColumns(ColumnSelector): """Drops specified columns in input data. - Arguments: + Parameters columns (list(string)): List of column names, used to determine which columns to drop. random_seed (int): Seed for the random number generator. Defaults to 0.
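At the pandas level, the DropColumns and SelectColumns transforms documented in this file reduce to the following; the real components also validate column names in fit and preserve Woodwork types:

    import pandas as pd

    X = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
    dropped = X.drop(columns=["b"])   # what DropColumns(columns=["b"]) produces
    selected = X[["a", "c"]]          # what SelectColumns(columns=["a", "c"]) produces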
""" @@ -77,7 +77,7 @@ def _modify_columns(self, cols, X, y=None): def transform(self, X, y=None): """Transforms data X by dropping columns. - Arguments: + Parameters X (pd.DataFrame): Data to transform. y (pd.Series, optional): Targets. @@ -90,7 +90,7 @@ def transform(self, X, y=None): class SelectColumns(ColumnSelector): """Selects specified columns in input data. - Arguments: + Parameters columns (list(string)): List of column names, used to determine which columns to select. random_seed (int): Seed for the random number generator. Defaults to 0. """ @@ -106,7 +106,7 @@ def _modify_columns(self, cols, X, y=None): def transform(self, X, y=None): """Transforms data X by selecting columns. - Arguments: + Parameters X (pd.DataFrame): Data to transform. y (pd.Series, optional): Targets. @@ -119,7 +119,7 @@ def transform(self, X, y=None): class SelectByType(ColumnSelector): """Selects columns by specified Woodwork logical type or semantic tag in input data. - Arguments: + Parameters column_types (string, ww.LogicalType, list(string), list(ww.LogicalType)): List of Woodwork types or tags, used to determine which columns to select. random_seed (int): Seed for the random number generator. Defaults to 0. """ diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py index 16fb1047b1..2af4a8cf46 100644 --- a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py +++ b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py @@ -12,7 +12,7 @@ class LinearDiscriminantAnalysis(Transformer): """Reduces the number of features by using Linear Discriminant Analysis. - Arguments: + Parameters n_components (int): The number of features to maintain after computation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. """ diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py index 84c43ce9cf..96cf0b6a10 100644 --- a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py +++ b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py @@ -13,7 +13,7 @@ class PCA(Transformer): """Reduces the number of features by using Principal Component Analysis (PCA). - Arguments: + Parameters variance (float): The percentage of the original data variance that should be preserved when reducing the number of features. Defaults to 0.95. n_components (int): The number of features to maintain after computing SVD. Defaults to None, but will override diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index 6d5ed612c3..1dc3aeaee6 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -20,7 +20,7 @@ class OneHotEncoderMeta(ComponentBaseMeta): class OneHotEncoder(Transformer, metaclass=OneHotEncoderMeta): """A transformer that encodes categorical features in a one-hot numeric array. - Arguments: + Parameters top_n (int): Number of categories per column to encode. If None, all categories will be encoded. Otherwise, the `n` most frequent will be encoded and all others will be dropped. Defaults to 10. features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched. 
@@ -158,7 +158,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): """One-hot encode the input data. - Arguments: + Parameters X (pd.DataFrame): Features to one-hot encode. y (pd.Series): Ignored. @@ -201,7 +201,7 @@ def _handle_parameter_handle_missing(self, X): def categories(self, feature_name): """Returns a list of the unique categories to be encoded for the particular feature, in order. - Arguments: + Parameters feature_name (str): The name of any feature provided to one-hot encoder during fit Returns: np.ndarray: the unique categories, in the same dtype as they were provided during fit diff --git a/evalml/pipelines/components/transformers/encoders/target_encoder.py b/evalml/pipelines/components/transformers/encoders/target_encoder.py index 96c7474f08..520823584e 100644 --- a/evalml/pipelines/components/transformers/encoders/target_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/target_encoder.py @@ -16,7 +16,7 @@ class TargetEncoder(Transformer, metaclass=OneHotEncoderMeta): """A transformer that encodes categorical features into target encodings. - Arguments: + Parameters cols (list): Columns to encode. If None, all string columns will be encoded, otherwise only the columns provided will be encoded. Defaults to None. smoothing (float): The smoothing factor to apply. The larger this value is, the more influence the expected target value has diff --git a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py index 0c6b8393b4..76971a509a 100644 --- a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py @@ -11,7 +11,7 @@ class FeatureSelector(Transformer): """Selects top features based on importance weights. - Arguments: + Parameters parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -35,7 +35,7 @@ def get_names(self): def transform(self, X, y=None): """Transforms input data by selecting features. If the component_obj does not have a transform method, will raise a MethodPropertyNotFoundError exception. - Arguments: + Parameters X (pd.DataFrame): Data to transform. y (pd.Series, optional): Target data. Ignored. diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py index f2f7fd9b76..245c19c418 100644 --- a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py @@ -9,7 +9,7 @@ class RFClassifierSelectFromModel(FeatureSelector): """Selects top features based on importance weights using a Random Forest classifier. - Arguments: + Parameters number_features (int): The maximum number of features to select. If both percent_features and number_features are specified, take the greater number of features. Defaults to None.
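The smoothing parameter documented for TargetEncoder blends a category's observed target mean with the global prior, so small categories defer to the prior. A sketch of the classic smoothed-mean formula (the category_encoders implementation evalml wraps may differ in detail):

    import pandas as pd

    def target_encode_sketch(col, y, smoothing=1.0):
        prior = y.mean()
        stats = y.groupby(col).agg(["mean", "count"])
        # larger smoothing pulls sparse categories toward the global mean
        encoding = (stats["count"] * stats["mean"] + smoothing * prior) / (stats["count"] + smoothing)
        return col.map(encoding)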
diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py index ecd163bd62..7dff11068a 100644 --- a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py @@ -9,7 +9,7 @@ class RFRegressorSelectFromModel(FeatureSelector): """Selects top features based on importance weights using a Random Forest regressor. - Arguments: + Parameters number_features (int): The maximum number of features to select. If both percent_features and number_features are specified, take the greater number of features. Defaults to None. diff --git a/evalml/pipelines/components/transformers/imputers/imputer.py b/evalml/pipelines/components/transformers/imputers/imputer.py index 868b66181c..28f57f1ed3 100644 --- a/evalml/pipelines/components/transformers/imputers/imputer.py +++ b/evalml/pipelines/components/transformers/imputers/imputer.py @@ -11,7 +11,7 @@ class Imputer(Transformer): """Imputes missing data according to a specified imputation strategy. - Arguments: + Parameters categorical_impute_strategy (string): Impute strategy to use for string, object, boolean, categorical dtypes. Valid values include "most_frequent" and "constant". numeric_impute_strategy (string): Impute strategy to use for numeric columns. Valid values include "mean", "median", "most_frequent", and "constant". categorical_fill_value (string): When categorical_impute_strategy == "constant", fill_value is used to replace missing data. The default value of None will fill with the string "missing_value". @@ -78,7 +78,7 @@ def __init__( def fit(self, X, y=None): """Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same. - Arguments: + Parameters X (pd.DataFrame, np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, optional): The target training data of length [n_samples] @@ -108,7 +108,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): """Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are treated as the same. - Arguments: + Parameters X (pd.DataFrame): Data to transform y (pd.Series, optional): Ignored. diff --git a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py index e78bf6bb46..8effc33331 100644 --- a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py @@ -11,7 +11,7 @@ class PerColumnImputer(Transformer): """Imputes missing data according to a specified imputation strategy per column. - Arguments: + Parameters impute_strategies (dict): Column and {"impute_strategy": strategy, "fill_value": value} pairings. Valid values for impute strategy include "mean", "median", "most_frequent", "constant" for numerical data, and "most_frequent", "constant" for object data types. Defaults to None, which uses "most_frequent" for all columns. @@ -54,7 +54,7 @@ def __init__( def fit(self, X, y=None): """Fits imputers on input data. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit. y (pd.Series, optional): The target training data of length [n_samples]. Ignored.
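The split strategy Imputer documents (categorical columns via most_frequent, numeric columns via mean or median) looks roughly like this at the pandas level; fill values, Woodwork types, and all-null columns are ignored here:

    import pandas as pd

    def impute_sketch(X):
        X = X.copy()
        for col in X.columns:
            if pd.api.types.is_numeric_dtype(X[col]):
                X[col] = X[col].fillna(X[col].mean())      # or .median()
            else:
                X[col] = X[col].fillna(X[col].mode().iloc[0])
        return X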
@@ -81,7 +81,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): """Transforms input data by imputing missing values. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to transform. y (pd.Series, optional): The target training data of length [n_samples]. Ignored. diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py index f8694afdd2..57777feacd 100644 --- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py @@ -12,7 +12,7 @@ class SimpleImputer(Transformer): """Imputes missing data according to a specified imputation strategy. - Arguments: + Parameters impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for numerical data, and "most_frequent", "constant" for object data types. fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data. @@ -40,7 +40,7 @@ def __init__( def fit(self, X, y=None): """Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] y (pd.Series, optional): the target training data of length [n_samples] @@ -71,7 +71,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same. - Arguments: + Parameters X (pd.DataFrame): Data to transform y (pd.Series, optional): Ignored. @@ -107,7 +107,7 @@ def transform(self, X, y=None): def fit_transform(self, X, y=None): """Fits on X and transforms X. - Arguments: + Parameters X (pd.DataFrame): Data to fit and transform y (pd.Series, optional): Target data. diff --git a/evalml/pipelines/components/transformers/imputers/target_imputer.py b/evalml/pipelines/components/transformers/imputers/target_imputer.py index 74c4615839..05899b6b32 100644 --- a/evalml/pipelines/components/transformers/imputers/target_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/target_imputer.py @@ -39,7 +39,7 @@ def _check_for_fit(self, X=None, y=None): class TargetImputer(Transformer, metaclass=TargetImputerMeta): """Imputes missing target data according to a specified imputation strategy. - Arguments: + Parameters impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for numerical data, and "most_frequent", "constant" for object data types. Defaults to "most_frequent". fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data. @@ -68,7 +68,7 @@ def __init__( def fit(self, X, y): """Fits imputer to target data. 'None' values are converted to np.nan before imputation and are treated as the same. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored. y (pd.Series, optional): The target training data of length [n_samples]. @@ -92,7 +92,7 @@ def fit(self, X, y): def transform(self, X, y): """Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same. - Arguments: + Parameters X (pd.DataFrame): Features. Ignored. y (pd.Series): Target data to impute. 
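The TargetImputer above differs from the feature imputers in that both fit and transform operate on y. A rough sketch, assuming (per the docstrings in this patch) that the features come back untouched alongside the imputed target.

    import numpy as np
    import pandas as pd
    from evalml.pipelines.components import TargetImputer

    X = pd.DataFrame({"feature": [1, 2, 3, 4]})
    y = pd.Series([0, 1, np.nan, 1])

    imputer = TargetImputer(impute_strategy="most_frequent")
    X_out, y_imputed = imputer.fit_transform(X, y)  # the NaN in y becomes 1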
@@ -120,7 +120,7 @@ def transform(self, X, y): def fit_transform(self, X, y): """Fits on and transforms the input target data. - Arguments: + Parameters X (pd.DataFrame): Features. Ignored. y (pd.Series): Target data to impute. diff --git a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py index 11159eb0df..e7c43201c5 100644 --- a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py @@ -58,7 +58,7 @@ def _extract_hour(col, encode_as_categories=False): class DateTimeFeaturizer(Transformer): """Transformer that can automatically extract features from datetime columns. - Arguments: + Parameters features_to_extract (list): List of features to extract. Valid options include "year", "month", "day_of_week", "hour". Defaults to None. encode_as_categories (bool): Whether day-of-week and month features should be encoded as pandas "category" dtype. This allows OneHotEncoders to encode these features. Defaults to False. @@ -119,7 +119,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): """Transforms data X by creating new features using existing DateTime columns, and then dropping those DateTime columns. - Arguments: + Parameters X (pd.DataFrame): Data to transform y (pd.Series, optional): Ignored. diff --git a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py index 3f9d86e0ca..82421fea68 100644 --- a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py @@ -9,7 +9,7 @@ class DelayedFeatureTransformer(Transformer): """Transformer that delays input features and target variable for time series problems. - Arguments: + Parameters date_index (str): Name of the column containing the datetime information used to order the data. Ignored. max_delay (int): Maximum number of time units to delay each feature. Defaults to 2. delay_features (bool): Whether to delay the input features. Defaults to True. @@ -57,7 +57,7 @@ def __init__( def fit(self, X, y=None): """Fits the DelayFeatureTransformer. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, optional): The target training data of length [n_samples] @@ -98,7 +98,7 @@ def transform(self, X, y=None): If y is not None, it will also compute the delayed values for the target variable. - Arguments: + Parameters X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used. y (pd.Series, or None): Target. diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py index 521fa6bfd7..cf7c778783 100644 --- a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py +++ b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py @@ -5,7 +5,7 @@ class DropNullColumns(Transformer): """Transformer to drop features whose percentage of NaN values exceeds a specified threshold. - Arguments: + Parameters pct_null_threshold(float): The percentage of NaN values in an input feature to drop. Must be a value between [0, 1] inclusive. 
If equal to 0.0, will drop columns with any null values. If equal to 1.0, will drop columns with all null values. Defaults to 0.95. @@ -43,7 +43,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): """Transforms data X by dropping columns that exceed the threshold of null values. - Arguments: + Parameters X (pd.DataFrame): Data to transform y (pd.Series, optional): Ignored. diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py b/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py index d68c9ecc5a..b93eeb43a2 100644 --- a/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py @@ -5,7 +5,7 @@ class DropRowsTransformer(Transformer): """Transformer to drop rows specified by row indices. - Arguments: + Parameters indices_to_drop (list): List of indices to drop in the input data. Defaults to None. random_seed (int): Seed for the random number generator. Is not used by this component. Defaults to 0. """ diff --git a/evalml/pipelines/components/transformers/preprocessing/featuretools.py b/evalml/pipelines/components/transformers/preprocessing/featuretools.py index 9fd4a1f63c..e875af12ab 100644 --- a/evalml/pipelines/components/transformers/preprocessing/featuretools.py +++ b/evalml/pipelines/components/transformers/preprocessing/featuretools.py @@ -10,7 +10,7 @@ class DFSTransformer(Transformer): """Featuretools DFS component that generates features for the input features. - Arguments: + Parameters index (string): The name of the column that contains the indices. If no column with this name exists, then featuretools.EntitySet() creates a column with this name to serve as the index column. Defaults to 'index'. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -46,7 +46,7 @@ def _make_entity_set(self, X): def fit(self, X, y=None): """Fits the DFSTransformer Transformer component. - Arguments: + Parameters X (pd.DataFrame, np.array): The input data to transform, of shape [n_samples, n_features] y (pd.Series, np.ndarray, optional): The target training data of length [n_samples] @@ -64,7 +64,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): """Computes the feature matrix for the input X using featuretools' dfs algorithm. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data to transform. Has shape [n_samples, n_features] y (pd.Series, optional): Ignored. diff --git a/evalml/pipelines/components/transformers/preprocessing/log_transformer.py b/evalml/pipelines/components/transformers/preprocessing/log_transformer.py index 75b65c1fdf..d7e6bc9026 100644 --- a/evalml/pipelines/components/transformers/preprocessing/log_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/log_transformer.py @@ -20,7 +20,7 @@ def __init__(self, random_seed=0): def fit(self, X, y=None): """Fits the LogTransformer. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): Ignored. y (pd.Series, optional): Ignored. @@ -32,7 +32,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): """Log transforms the target variable. - Arguments: + Parameters X (pd.DataFrame, optional): Ignored. y (pd.Series): Target data to log transform. @@ -52,7 +52,7 @@ def transform(self, X, y=None): def fit_transform(self, X, y=None): """Log transforms the target variable. - Arguments: + Parameters X (pd.DataFrame, optional): Ignored. 
y (pd.Series): Target variable to log transform. diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 2fc239355a..b0a37f16bd 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -12,7 +12,7 @@ class LSA(TextTransformer): """Transformer to calculate the Latent Semantic Analysis Values of text input. - Arguments: + Parameters random_seed (int): Seed for the random number generator. Defaults to 0. """ @@ -42,7 +42,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): """Transforms data X by applying the LSA pipeline. - Arguments: + Parameters X (pd.DataFrame): The data to transform. y (pd.Series, optional): Ignored. diff --git a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py index 92b618c572..b811a675ab 100644 --- a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py +++ b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py @@ -10,7 +10,7 @@ class PolynomialDetrender(TargetTransformer): """Removes trends from time series by fitting a polynomial to the data. - Arguments: + Parameters degree (int): Degree for the polynomial. If 1, linear model is fit to the data. If 2, quadratic model is fit, etc. Defaults to 1. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -49,7 +49,7 @@ def __init__(self, degree=1, random_seed=0, **kwargs): def fit(self, X, y=None): """Fits the PolynomialDetrender. - Arguments: + Parameters X (pd.DataFrame, optional): Ignored. y (pd.Series): Target variable to detrend. @@ -65,7 +65,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): """Removes fitted trend from target variable. - Arguments: + Parameters X (pd.DataFrame, optional): Ignored. y (pd.Series): Target variable to detrend. @@ -83,7 +83,7 @@ def transform(self, X, y=None): def fit_transform(self, X, y=None): """Removes fitted trend from target variable. - Arguments: + Parameters X (pd.DataFrame, optional): Ignored. y (pd.Series): Target variable to detrend. @@ -96,7 +96,7 @@ def fit_transform(self, X, y=None): def inverse_transform(self, y): """Adds back fitted trend to target variable. - Arguments: + Parameters X (pd.DataFrame, optional): Ignored. y (pd.Series): Target variable. diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index c6312b11b3..6d54523ff4 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -19,7 +19,7 @@ class TextFeaturizer(TextTransformer): LSA (Latent Semantic Analysis). Calling transform on this component will replace any text columns in the given dataset with these numeric columns. - Arguments: + Parameters random_seed (int): Seed for the random number generator. Defaults to 0. """ @@ -75,7 +75,7 @@ def _make_entity_set(self, X, text_columns): def fit(self, X, y=None): """Fits component to data. 
- Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, np.ndarray, optional): The target training data of length [n_samples] @@ -115,7 +115,7 @@ def _get_primitives_provenance(features): def transform(self, X, y=None): """Transforms data X by creating new features using existing text columns. - Arguments: + Parameters X (pd.DataFrame): The data to transform. y (pd.Series, optional): Ignored. diff --git a/evalml/pipelines/components/transformers/preprocessing/text_transformer.py b/evalml/pipelines/components/transformers/preprocessing/text_transformer.py index 104f8c8b01..cd83db602a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_transformer.py @@ -7,7 +7,7 @@ class TextTransformer(Transformer): """Base class for all transformers working with text features. - Arguments: + Parameters component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. """ diff --git a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py index fff7319de4..3046102a8e 100644 --- a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py +++ b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py @@ -112,7 +112,7 @@ def _get_feature_provenance(self): class EmailFeaturizer(_ExtractFeaturesWithTransformPrimitives): """Transformer that can automatically extract features from emails. - Arguments: + Parameters random_seed (int): Seed for the random number generator. Defaults to 0. """ @@ -135,7 +135,7 @@ def _get_feature_types_for_featuretools(self, X): class URLFeaturizer(_ExtractFeaturesWithTransformPrimitives): """Transformer that can automatically extract features from URL. - Arguments: + Parameters random_seed (int): Seed for the random number generator. Defaults to 0. """ diff --git a/evalml/pipelines/components/transformers/samplers/base_sampler.py b/evalml/pipelines/components/transformers/samplers/base_sampler.py index ab82a1d033..9a6b4d26f7 100644 --- a/evalml/pipelines/components/transformers/samplers/base_sampler.py +++ b/evalml/pipelines/components/transformers/samplers/base_sampler.py @@ -10,7 +10,7 @@ class BaseSampler(Transformer): """Base Sampler component. Used as the base class of all sampler components. - Arguments: + Parameters parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -22,7 +22,7 @@ class BaseSampler(Transformer): def fit(self, X, y): """Fits the sampler to the data. - Arguments: + Parameters X (pd.DataFrame): Input features. y (pd.Series): Target. @@ -39,7 +39,7 @@ def fit(self, X, y): def _initialize_sampler(self, X, y): """Helper function to initialize the sampler component object. - Arguments: + Parameters X (pd.DataFrame): Features. y (pd.Series): The target data. """ @@ -47,7 +47,7 @@ def _initialize_sampler(self, X, y): def _prepare_data(self, X, y): """Transforms the input data to pandas data structure that our sampler can ingest. - Arguments: + Parameters X (pd.DataFrame): Training features. y (pd.Series): Target. 
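The ratio-to-count conversion that _convert_dictionary performs (documented in the hunks that follow) is easier to follow with concrete numbers. This plain-Python illustration mirrors the documented contract, not the private implementation itself.

    from collections import Counter

    import pandas as pd

    y = pd.Series([0] * 800 + [1] * 100)  # 8:1 class imbalance
    sampling_dict = {0: 1.0, 1: 0.5}      # desired min:max ratios per class

    counts = Counter(y)
    majority = max(counts.values())       # 800
    # Each ratio is interpreted against the majority class size.
    samples = {k: int(ratio * majority) for k, ratio in sampling_dict.items()}
    # samples == {0: 800, 1: 400}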
@@ -63,7 +63,7 @@ def _prepare_data(self, X, y):
     def transform(self, X, y=None):
         """Transforms the input data by sampling the data.

-        Arguments:
+        Parameters
            X (pd.DataFrame): Training features.
            y (pd.Series): Target.

@@ -77,7 +77,7 @@ def transform(self, X, y=None):
     def _convert_dictionary(self, sampling_dict, y):
         """Converts the provided sampling dictionary from a dictionary of ratios to a dictionary of number of samples. Expects the provided dictionary keys to be the target values y, and the associated values to be the min:max ratios. Converts and returns a dictionary with the same keys, but changes the values to be the number of samples rather than ratio.

-        Arguments:
+        Parameters
            sampling_dict (dict): The input sampling dictionary passed in from user.
            y (pd.Series): The target values.

@@ -111,7 +111,7 @@ def _convert_dictionary(self, sampling_dict, y):
     def _dictionary_to_params(self, sampling_dict, y):
         """If a sampling ratio dictionary is provided, add the updated sampling dictionary to the parameters and return the updated parameter dictionary. Otherwise, simply return the current parameters.

-        Arguments:
+        Parameters
            sampling_dict (dict): The input sampling dictionary passed in from user.
            y (pd.Series): The target values.

@@ -133,7 +133,7 @@ def fit_transform(self, X, y):
 class BaseOversampler(BaseSampler):
     """Base Oversampler component. Used as the base class of all imbalance-learn oversampler components.

-    Arguments:
+    Parameters
        sampler (obj): Sampler object to use.
        sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means
            we want a 1:4 ratio of the minority to majority class after oversampling. We will create a sampling dictionary using this ratio, with the keys corresponding to the class
@@ -176,7 +176,7 @@ def __init__(
     def _initialize_sampler(self, X, y):
         """Initializes the oversampler with the given sampler_ratio or sampler_ratio_dict. If a sampler_ratio_dict is provided, we will opt to use that. Otherwise, we will create the sampler_ratio_dict dictionary.

-        Arguments:
+        Parameters
            X (pd.DataFrame): Input features.
            y (pd.Series): Target.
         """
diff --git a/evalml/pipelines/components/transformers/samplers/oversamplers.py b/evalml/pipelines/components/transformers/samplers/oversamplers.py
index a7c9c429e4..4efb508ebc 100644
--- a/evalml/pipelines/components/transformers/samplers/oversamplers.py
+++ b/evalml/pipelines/components/transformers/samplers/oversamplers.py
@@ -7,7 +7,7 @@ class SMOTEOversampler(BaseOversampler):
     """SMOTE Oversampler component. Works on numerical datasets only. This component is only run during training and not during predict.

-    Arguments:
+    Parameters
        sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means we want a 1:4 ratio
            of the minority to majority class after oversampling. We will create a sampling dictionary using this ratio, with the keys corresponding to the class
            and the values corresponding to the number of samples. Defaults to 0.25.
@@ -42,7 +42,7 @@ def __init__(
 class SMOTENCOversampler(BaseOversampler):
     """SMOTENC Oversampler component. Uses SMOTENC to generate synthetic samples. Works on a mix of numerical and categorical columns. Input data must be Woodwork type, and this component is only run during training and not during predict.

-    Arguments:
+    Parameters
        sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1].
           A value of 0.25 means we want a 1:4 ratio of the minority to majority class after oversampling. We will create a sampling dictionary using this ratio, with the keys corresponding to the class
            and the values corresponding to the number of samples. Defaults to 0.25.
@@ -93,7 +93,7 @@ def fit(self, X, y):
 class SMOTENOversampler(BaseOversampler):
     """SMOTEN Oversampler component. Uses SMOTEN to generate synthetic samples. Works for purely categorical datasets. This component is only run during training and not during predict.

-    Arguments:
+    Parameters
        sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means we want a 1:4 ratio
            of the minority to majority class after oversampling. We will create a sampling dictionary using this ratio, with the keys corresponding to the class
            and the values corresponding to the number of samples. Defaults to 0.25.
diff --git a/evalml/pipelines/components/transformers/samplers/undersampler.py b/evalml/pipelines/components/transformers/samplers/undersampler.py
index 091ce9530f..39f30d2893 100644
--- a/evalml/pipelines/components/transformers/samplers/undersampler.py
+++ b/evalml/pipelines/components/transformers/samplers/undersampler.py
@@ -13,7 +13,7 @@ class Undersampler(BaseSampler):

     This component is only run during training and not during predict.

-    Arguments:
+    Parameters
        sampling_ratio (float): The smallest minority:majority ratio that is accepted as 'balanced'. For instance, a 1:4 ratio would be represented as 0.25,
            while a 1:1 ratio is 1.0. Must be between 0 and 1, inclusive. Defaults to 0.25.
        sampling_ratio_dict (dict): A dictionary specifying the desired balanced ratio for each target value. For instance, in a binary case where class 1 is the minority, we could specify:
@@ -55,7 +55,7 @@ def __init__(
     def _initialize_sampler(self, X, y):
         """Helper function to initialize the undersampler component object.

-        Arguments:
+        Parameters
            y (pd.Series): The target data
         """
         param_dic = self._dictionary_to_params(
diff --git a/evalml/pipelines/components/transformers/scalers/standard_scaler.py b/evalml/pipelines/components/transformers/scalers/standard_scaler.py
index 168c6735ce..ae8f2c544e 100644
--- a/evalml/pipelines/components/transformers/scalers/standard_scaler.py
+++ b/evalml/pipelines/components/transformers/scalers/standard_scaler.py
@@ -12,7 +12,7 @@ class StandardScaler(Transformer):
     """A transformer that standardizes input features by removing the mean and scaling to unit variance.

-    Arguments:
+    Parameters
        random_seed (int): Seed for the random number generator. Defaults to 0.
     """

diff --git a/evalml/pipelines/components/transformers/transformer.py b/evalml/pipelines/components/transformers/transformer.py
index 13d8316f01..c8b6c55d24 100644
--- a/evalml/pipelines/components/transformers/transformer.py
+++ b/evalml/pipelines/components/transformers/transformer.py
@@ -22,7 +22,7 @@ class Transformer(ComponentBase):

     To see some examples, check out the definitions of any Transformer component.

-    Arguments:
+    Parameters
        parameters (dict): Dictionary of parameters for the component. Defaults to None.
        component_obj (obj): Third-party objects useful in component implementation. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.
@@ -36,7 +36,7 @@ class Transformer(ComponentBase):
     def transform(self, X, y=None):
         """Transforms data X.

-        Arguments:
+        Parameters
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Target data.
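A short usage sketch for the Undersampler documented above. The sampling_ratio semantics follow its docstring; the data is invented, and the tuple return shape is an assumption worth verifying against the component.

    import pandas as pd
    from evalml.pipelines.components import Undersampler

    X = pd.DataFrame({"f": range(1000)})
    y = pd.Series([0] * 900 + [1] * 100)  # 9:1 imbalance

    # Ask for at least a 1:4 minority:majority ratio after downsampling.
    sampler = Undersampler(sampling_ratio=0.25)
    X_sampled, y_sampled = sampler.fit_transform(X, y)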
@@ -60,7 +60,7 @@ def transform(self, X, y=None): def fit_transform(self, X, y=None): """Fits on X and transforms X. - Arguments: + Parameters X (pd.DataFrame): Data to fit and transform y (pd.Series): Target data @@ -95,7 +95,7 @@ class TargetTransformer(Transformer): def inverse_transform(self, y): """Inverts the transformation done by the transform method. - Arguments: + Parameters y (pd.Series): Target transformed by this component. Returns: diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index b49dbf8c2e..0206a82eab 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -34,7 +34,7 @@ def all_components(): def allowed_model_families(problem_type): """List the model types allowed for a particular problem type. - Arguments: + Parameters problem_types (ProblemTypes or str): binary, multiclass, or regression Returns: @@ -58,7 +58,7 @@ def get_estimators(problem_type, model_families=None): Can also optionally filter by a list of model types. - Arguments: + Parameters problem_type (ProblemTypes or str): problem type to filter for model_families (list[ModelFamily] or list[str]): model families to filter for @@ -102,7 +102,7 @@ def handle_component_class(component_class): return a new instance. Otherwise if a ComponentBase subclass or Component instance is provided, will return that without modification. - Arguments: + Parameters component (str, ComponentBase): input to be standardized Returns: @@ -133,7 +133,7 @@ class WrappedSKClassifier(BaseEstimator, ClassifierMixin): def __init__(self, pipeline): """Scikit-learn classifier wrapper class. Takes an EvalML pipeline as input and returns a scikit-learn classifier class wrapping that pipeline. - Arguments: + Parameters pipeline (PipelineBase or subclass obj): EvalML pipeline """ self.pipeline = pipeline @@ -146,7 +146,7 @@ def __init__(self, pipeline): def fit(self, X, y): """Fits component to data. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] y (pd.Series, optional): the target training data of length [n_samples] @@ -163,7 +163,7 @@ def fit(self, X, y): def predict(self, X): """Make predictions using selected features. - Arguments: + Parameters X (pd.DataFrame): Features Returns: @@ -176,7 +176,7 @@ def predict(self, X): def predict_proba(self, X): """Make probability estimates for labels. - Arguments: + Parameters X (pd.DataFrame): Features Returns: @@ -191,7 +191,7 @@ class WrappedSKRegressor(BaseEstimator, RegressorMixin): def __init__(self, pipeline): """Scikit-learn regressor wrapper class. Takes an EvalML pipeline as input and returns a scikit-learn regressor class wrapping that pipeline. - Arguments: + Parameters pipeline (PipelineBase or subclass obj): EvalML pipeline """ self.pipeline = pipeline @@ -203,7 +203,7 @@ def __init__(self, pipeline): def fit(self, X, y): """Fits component to data. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] y (pd.Series, optional): the target training data of length [n_samples] @@ -216,7 +216,7 @@ def fit(self, X, y): def predict(self, X): """Make predictions using selected features. - Arguments: + Parameters X (pd.DataFrame): Features Returns: @@ -260,7 +260,7 @@ def scikit_learn_wrapped_estimator(evalml_obj): def generate_component_code(element): """Creates and returns a string that contains the Python imports and code required for running the EvalML component. 
- Arguments: + Parameters element (component instance): The instance of the component to generate string Python code for Returns: @@ -295,7 +295,7 @@ def generate_component_code(element): def make_balancing_dictionary(y, sampling_ratio): """Makes dictionary for oversampler components. Find ratio of each class to the majority. If the ratio is smaller than the sampling_ratio, we want to oversample, otherwise, we don't want to sample at all, and we leave the data as is. - Arguments: + Parameters y (pd.Series): Target data sampling_ratio (float): The balanced ratio we want the samples to meet diff --git a/evalml/pipelines/multiclass_classification_pipeline.py b/evalml/pipelines/multiclass_classification_pipeline.py index 3efe301a9c..fd6af206c4 100644 --- a/evalml/pipelines/multiclass_classification_pipeline.py +++ b/evalml/pipelines/multiclass_classification_pipeline.py @@ -5,7 +5,7 @@ class MulticlassClassificationPipeline(ClassificationPipeline): """Pipeline subclass for all multiclass classification pipelines. - Arguments: + Parameters component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 455e37fc59..9354c7d629 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -40,7 +40,7 @@ class PipelineBase(ABC, metaclass=PipelineBaseMeta): """Machine learning pipeline made out of transformers and an Estimator. - Arguments: + Parameters component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph @@ -204,7 +204,7 @@ def __setitem__(self, index, value): def get_component(self, name): """Returns component by name. - Arguments: + Parameters name (str): Name of component Returns: @@ -215,7 +215,7 @@ def get_component(self, name): def describe(self, return_dict=False): """Outputs pipeline details including component parameters. - Arguments: + Parameters return_dict (bool): If True, return dictionary of information about pipeline. Defaults to False. Returns: @@ -249,7 +249,7 @@ def describe(self, return_dict=False): def compute_estimator_features(self, X, y=None): """Transforms the data by applying all pre-processing components. - Arguments: + Parameters X (pd.DataFrame): Input data to the pipeline to transform. Returns: @@ -266,7 +266,7 @@ def _fit(self, X, y): def fit(self, X, y): """Build a model. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. y (pd.Series, np.ndarray): The target training data of length [n_samples]. @@ -277,7 +277,7 @@ def fit(self, X, y): def transform(self, X, y=None): """Transform the input. - Arguments: + Parameters X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. y (pd.Series): The target data of length [n_samples]. Defaults to None. @@ -289,7 +289,7 @@ def transform(self, X, y=None): def predict(self, X, objective=None): """Make predictions using selected features. - Arguments: + Parameters X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. 
objective (Object or string): The objective to use to make predictions. @@ -305,7 +305,7 @@ def predict(self, X, objective=None): def score(self, X, y, objectives): """Evaluate model performance on current and additional objectives. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]. y (pd.Series, np.ndarray): True labels of length [n_samples]. objectives (list): Non-empty list of objectives to score on. @@ -323,7 +323,7 @@ def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives): Will raise a PipelineScoreError if any objectives fail. - Arguments: + Parameters X (pd.DataFrame): The feature matrix. y (pd.Series): The target data. y_pred (pd.Series): The pipeline predictions. @@ -413,7 +413,7 @@ def feature_importance(self): def graph(self, filepath=None): """Generate an image representing the pipeline graph. - Arguments: + Parameters filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph will not be saved. Returns: @@ -468,7 +468,7 @@ def graph(self, filepath=None): def graph_feature_importance(self, importance_threshold=0): """Generate a bar graph of the pipeline's feature importance. - Arguments: + Parameters importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is larger than importance_threshold. Defaults to zero. Returns: @@ -515,7 +515,7 @@ def graph_feature_importance(self, importance_threshold=0): def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): """Saves pipeline at file path. - Arguments: + Parameters file_path (str): location to save file pickle_protocol (int): the pickle data stream format. @@ -529,7 +529,7 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): def load(file_path): """Loads pipeline at file path. - Arguments: + Parameters file_path (str): location to load file Returns: @@ -554,7 +554,7 @@ def clone(self): def new(self, parameters, random_seed=0): """Constructs a new instance of the pipeline with the same component graph but with a different set of parameters. Not to be confused with python's __new__ method. - Arguments: + Parameters parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. An empty dictionary or None implies using all default values for component parameters. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -677,7 +677,7 @@ def create_objectives(objectives): def can_tune_threshold_with_objective(self, objective): """Determine whether the threshold of a binary classification pipeline can be tuned. - Arguments: + Parameters pipeline (PipelineBase): Binary classification pipeline. objective (ObjectiveBase): Primary AutoMLSearch objective. @@ -695,7 +695,7 @@ def inverse_transform(self, y): Components that implement inverse_transform are PolynomialDetrender, LabelEncoder (tbd). - Arguments: + Parameters y (pd.Series): Final component features """ return self.component_graph.inverse_transform(y) @@ -703,7 +703,7 @@ def inverse_transform(self, y): def get_hyperparameter_ranges(self, custom_hyperparameters): """Returns hyperparameter ranges from all components as a dictionary. - Arguments: + Parameters custom_hyperparameters (dict): Custom hyperparameters for the pipeline. 
Returns: diff --git a/evalml/pipelines/regression_pipeline.py b/evalml/pipelines/regression_pipeline.py index 5a470367a6..46e7b6c722 100644 --- a/evalml/pipelines/regression_pipeline.py +++ b/evalml/pipelines/regression_pipeline.py @@ -6,7 +6,7 @@ class RegressionPipeline(PipelineBase): """Pipeline subclass for all regression pipelines. - Arguments: + Parameters component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph @@ -24,7 +24,7 @@ class RegressionPipeline(PipelineBase): def fit(self, X, y): """Build a regression model. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, np.ndarray): The target training data of length [n_samples] @@ -42,7 +42,7 @@ def fit(self, X, y): def score(self, X, y, objectives): """Evaluate model performance on current and additional objectives. - Arguments: + Parameters X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features] y (pd.Series, or np.ndarray): True values of length [n_samples] objectives (list): Non-empty list of objectives to score on diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py index 2580e0d816..245ade1656 100644 --- a/evalml/pipelines/time_series_classification_pipelines.py +++ b/evalml/pipelines/time_series_classification_pipelines.py @@ -18,7 +18,7 @@ class TimeSeriesClassificationPipeline(TimeSeriesPipelineBase, ClassificationPipeline): """Pipeline base class for time series classification problems. - Arguments: + Parameters component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph @@ -34,7 +34,7 @@ class TimeSeriesClassificationPipeline(TimeSeriesPipelineBase, ClassificationPip def fit(self, X, y): """Fit a time series classification pipeline. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, np.ndarray): The target training targets of length [n_samples] @@ -71,7 +71,7 @@ def _predict(self, X, y, objective=None, pad=False): def predict(self, X, y=None, objective=None): """Make predictions using selected features. - Arguments: + Parameters X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. y (pd.Series, np.ndarray, None): The target training targets of length [n_samples]. objective (Object or string): The objective to use to make predictions. @@ -98,7 +98,7 @@ def predict(self, X, y=None, objective=None): def predict_proba(self, X, y=None): """Make probability estimates for labels. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]. Returns: @@ -120,7 +120,7 @@ def predict_proba(self, X, y=None): def score(self, X, y, objectives): """Evaluate model performance on current and additional objectives. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]. y (pd.Series): True labels of length [n_samples]. objectives (list): Non-empty list of objectives to score on. 
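To make the fit/score contract documented above concrete, a minimal regression pipeline run. Component, objective, and demo-loader names are standard EvalML strings; treat the exact component graph as an illustrative choice.

    from evalml.demos import load_diabetes
    from evalml.pipelines import RegressionPipeline
    from evalml.preprocessing import split_data

    X, y = load_diabetes()
    X_train, X_test, y_train, y_test = split_data(X, y, problem_type="regression")

    pipeline = RegressionPipeline(component_graph=["Imputer", "Random Forest Regressor"])
    pipeline.fit(X_train, y_train)
    print(pipeline.score(X_test, y_test, objectives=["R2"]))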
@@ -153,7 +153,7 @@ class TimeSeriesBinaryClassificationPipeline( ): """Pipeline base class for time series binary classification problems. - Arguments: + Parameters component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph @@ -207,7 +207,7 @@ def _score(X, y, predictions, objective): class TimeSeriesMulticlassClassificationPipeline(TimeSeriesClassificationPipeline): """Pipeline base class for time series multiclass classification problems. - Arguments: + Parameters component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py index e9012068ed..7aa0eeffa7 100644 --- a/evalml/pipelines/time_series_pipeline_base.py +++ b/evalml/pipelines/time_series_pipeline_base.py @@ -9,7 +9,7 @@ class TimeSeriesPipelineBase(PipelineBase, metaclass=PipelineBaseMeta): """Pipeline base class for time series problems. - Arguments: + Parameters component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph @@ -56,7 +56,7 @@ def _convert_to_woodwork(X, y): def fit(self, X, y): """Fit a time series pipeline. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. y (pd.Series, np.ndarray): The target training targets of length [n_samples]. diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py index 435d70f53d..68d5f9b228 100644 --- a/evalml/pipelines/time_series_regression_pipeline.py +++ b/evalml/pipelines/time_series_regression_pipeline.py @@ -10,7 +10,7 @@ class TimeSeriesRegressionPipeline(TimeSeriesPipelineBase): """Pipeline base class for time series regression problems. - Arguments: + Parameters component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph @@ -29,7 +29,7 @@ class TimeSeriesRegressionPipeline(TimeSeriesPipelineBase): def predict(self, X, y=None, objective=None): """Make predictions using selected features. - Arguments: + Parameters X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. y (pd.Series, np.ndarray, None): The target training targets of length [n_samples]. objective (Object or string): The objective to use to make predictions. @@ -56,7 +56,7 @@ def predict(self, X, y=None, objective=None): def score(self, X, y, objectives): """Evaluate model performance on current and additional objectives. - Arguments: + Parameters X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]. y (pd.Series): True labels of length [n_samples]. objectives (list): Non-empty list of objectives to score on. 
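Time series pipelines differ from the standard ones mainly in configuration, and predict takes the target so delayed features can be computed. A hedged sketch: the "pipeline" parameter block with date_index, gap, and max_delay reflects how these pipelines are typically configured in this version, and the exact keys should be treated as an assumption.

    import pandas as pd
    from evalml.pipelines import TimeSeriesRegressionPipeline

    X = pd.DataFrame({"feature": range(30)})
    y = pd.Series(range(30))

    pipeline = TimeSeriesRegressionPipeline(
        component_graph=["Delayed Feature Transformer", "Random Forest Regressor"],
        parameters={"pipeline": {"date_index": None, "gap": 0, "max_delay": 2}},
    )
    pipeline.fit(X, y)
    # predict needs the target to build the delayed features described above.
    preds = pipeline.predict(X, y=y)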
diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index dbbbf9279f..7afface60b 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -56,17 +56,19 @@ def _get_preprocessing_components( X, y, problem_type, estimator_class, sampler_name=None ): - """Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data. + """ + Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data. - Arguments: - X (pd.DataFrame): The input data of shape [n_samples, n_features] - y (pd.Series): The target data of length [n_samples] - problem_type (ProblemTypes or str): Problem type - estimator_class (class): A class which subclasses Estimator estimator for pipeline, - sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None + Parameters + ---------- + X (pd.DataFrame): The input data of shape [n_samples, n_features]. + y (pd.Series): The target data of length [n_samples]. + problem_type (ProblemTypes or str): Problem type. + estimator_class (class): A class which subclasses Estimator estimator for pipeline. + sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None. Returns: - list[Transformer]: A list of applicable preprocessing components to use with the estimator + list[Transformer]: A list of applicable preprocessing components to use with the estimator. """ pp_components = [] @@ -187,21 +189,24 @@ def make_pipeline( sampler_name=None, extra_components=None, ): - """Given input data, target data, an estimator class and the problem type, generates a pipeline class with a preprocessing chain which was recommended based on the inputs. The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. - - Arguments: - X (pd.DataFrame): The input data of shape [n_samples, n_features] - y (pd.Series): The target data of length [n_samples] - estimator (Estimator): Estimator for pipeline - problem_type (ProblemTypes or str): Problem type for pipeline to generate + """ + Given input data, target data, an estimator class and the problem type, generates a pipeline class with a preprocessing chain which was recommended based on the inputs. The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. + + Parameters + ---------- + X (pd.DataFrame): The input data of shape [n_samples, n_features]. + y (pd.Series): The target data of length [n_samples]. + estimator (Estimator): Estimator for pipeline. + problem_type (ProblemTypes or str): Problem type for pipeline to generate. parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. An empty dictionary or None implies using all default values for component parameters. sampler_name (str): The name of the sampler component to add to the pipeline. Only used in classification problems. Defaults to None extra_components (list(ComponentBase)): List of extra components to be added after preprocessing components. Defaults to None. - Returns: - PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator + Returns + ------- + PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator. 
""" X = infer_feature_types(X) y = infer_feature_types(y) @@ -231,7 +236,7 @@ def make_pipeline( def generate_pipeline_code(element): """Creates and returns a string that contains the Python imports and code required for running the EvalML pipeline. - Arguments: + Parameters element (pipeline instance): The instance of the pipeline to generate string Python code Returns: @@ -260,7 +265,7 @@ def _make_stacked_ensemble_pipeline( ): """Creates a pipeline with a stacked ensemble estimator. - Arguments: + Parameters input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators for the stacked ensemble. This must not be None or an empty list or else EnsembleMissingPipelinesError will be raised. problem_type (ProblemType): problem type of pipeline @@ -315,7 +320,7 @@ def _make_stacked_ensemble_pipeline( def _make_component_list_from_actions(actions): """Creates a list of components from the input DataCheckAction list. - Arguments: + Parameters actions (list(DataCheckAction)): List of DataCheckAction objects used to create list of components Returns: diff --git a/evalml/preprocessing/__init__.py b/evalml/preprocessing/__init__.py index fa728de2f2..0b38f79f89 100644 --- a/evalml/preprocessing/__init__.py +++ b/evalml/preprocessing/__init__.py @@ -1,3 +1,4 @@ +"""Preprocessing utilities.""" from .utils import ( load_data, split_data, diff --git a/evalml/preprocessing/data_splitters/__init__.py b/evalml/preprocessing/data_splitters/__init__.py index 2e2f435f79..8007f946df 100644 --- a/evalml/preprocessing/data_splitters/__init__.py +++ b/evalml/preprocessing/data_splitters/__init__.py @@ -1,3 +1,4 @@ +"""Data splitter classes.""" from .training_validation_split import TrainingValidationSplit from .time_series_split import TimeSeriesSplit from .balanced_classification_sampler import BalancedClassificationSampler diff --git a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py index bd89df7592..81c46505a3 100644 --- a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py +++ b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py @@ -9,7 +9,7 @@ class BalancedClassificationSampler(SamplerBase): """ Class for balanced classification downsampler. - Arguments: + Parameters --------- sampling_ratio (float): The smallest minority:majority ratio that is accepted as 'balanced'. For instance, a 1:4 ratio would be represented as 0.25, while a 1:1 ratio is 1.0. Must be between 0 and 1, inclusive. Defaults to 0.25. @@ -56,11 +56,11 @@ def _find_ideal_samples(self, y): """ Return dictionary of examples to drop for each class if we need to resample. - Arguments: + Parameters --------- y (pd.Series): Target data passed in. - Returns: + Returns ------- (dict): dictionary with undersample target class as key, and number of samples to remove as the value. If we don't need to resample, returns empty dictionary. @@ -93,11 +93,11 @@ def _sampling_dict_to_remove_dict(self, y): """ Turn the sampling dict input into a dict of samples to remove for each target, similar to the return of _find_ideal_samples. - Arguments: + Parameters --------- y (pd.Series): Training data targets. - Returns: + Returns ------- (dict): dictionary with undersample target class as key, and number of samples to remove as the value. If we don't need to resample, returns empty dictionary. 
@@ -112,12 +112,12 @@ def fit_resample(self, X, y):
         """
         Resampling technique for this sampler.

-        Arguments:
+        Parameters
         ---------
            X (pd.DataFrame): Training data to fit and resample.
            y (pd.Series): Training data targets to fit and resample.

-        Returns:
+        Returns
         -------
            list: Indices to keep for training data.
         """
diff --git a/evalml/preprocessing/data_splitters/sampler_base.py b/evalml/preprocessing/data_splitters/sampler_base.py
index 780b3dc1ac..db554cadcf 100644
--- a/evalml/preprocessing/data_splitters/sampler_base.py
+++ b/evalml/preprocessing/data_splitters/sampler_base.py
@@ -6,7 +6,7 @@ class SamplerBase(ABC):
     """
     Base class for all custom samplers.

-    Arguments:
+    Parameters
     ---------
        random_seed (int): The seed to use for random sampling. Defaults to 0.

@@ -20,12 +20,12 @@ def fit_resample(self, X, y):
         """
         Resample the input data with this sampling strategy.

-        Arguments:
+        Parameters
         ---------
            X (pd.DataFrame): Training data to fit and resample.
            y (pd.Series): Training data targets to fit and resample.

-        Returns:
+        Returns
         -------
            Tuple(pd.DataFrame, pd.Series) or list: resampled X and y data for oversampling or indices to keep for undersampling.

diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py
index a57db23dea..5c3c8123a5 100644
--- a/evalml/preprocessing/data_splitters/time_series_split.py
+++ b/evalml/preprocessing/data_splitters/time_series_split.py
@@ -13,7 +13,7 @@ class TimeSeriesSplit(BaseCrossValidator):
     desired amount. If the data that will be split already has all the features and appropriate target values, then set max_delay and gap to 0.

-    Arguments:
+    Parameters
     ---------
        max_delay (int): Max delay value for feature engineering. Time series pipelines create delayed features
            from existing features. This process will introduce NaNs into the first max_delay number of rows. The
@@ -47,7 +47,7 @@ def split(self, X, y=None, groups=None):
     This method can handle passing in empty or None X and y data but note that X and y cannot be None or empty at the same time.

-        Arguments:
+        Parameters
         ---------
            X (pd.DataFrame, None): Features to split.
            y (pd.DataFrame, None): Target variable to split. Defaults to None.
diff --git a/evalml/preprocessing/data_splitters/training_validation_split.py b/evalml/preprocessing/data_splitters/training_validation_split.py
index dfded94aa4..de0af50e8b 100644
--- a/evalml/preprocessing/data_splitters/training_validation_split.py
+++ b/evalml/preprocessing/data_splitters/training_validation_split.py
@@ -1,3 +1,4 @@
+"""Training Validation Split class."""
 import numpy as np
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection._split import BaseCrossValidator
@@ -6,7 +7,8 @@ class TrainingValidationSplit(BaseCrossValidator):
     """Split the training data into training and validation sets.

-    Arguments:
+    Parameters
+    ---------
        test_size (float): What percentage of data points should be included in the validation set. Defaults to the complement of `train_size` if `train_size` is set, and 0.25 otherwise.
        train_size (float): What percentage of data points should be included in the training set.
@@ -39,12 +41,12 @@ def get_n_splits():
     def split(self, X, y=None):
         """Divide the data into training and testing sets.
- Arguments: + Parameters --------- X (pd.DataFrame): Dataframe of points to split y (pd.Series): Series of points to split - Returns: + Returns ------- list: Indices to split data into training and test set """ diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index e4546c904f..47f930df12 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -1,3 +1,4 @@ +"""Helpful preprocessing utilities.""" import pandas as pd from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit @@ -14,7 +15,7 @@ def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwarg """ Load features and target from file. - Arguments: + Parameters --------- path (str): Path to file or a http/ftp/s3 URL. index (str): Column for index. @@ -23,12 +24,10 @@ def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwarg drop (list): List of columns to drop. Defaults to None. verbose (bool): If True, prints information about features and target. Defaults to True. - Returns: + Returns ------- pd.DataFrame, pd.Series: Features matrix and target. - """ - feature_matrix = pd.read_csv(path, index_col=index, nrows=n_rows, **kwargs) targets = [target] + (drop or []) @@ -52,9 +51,9 @@ def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwarg def split_data( X, y, problem_type, problem_configuration=None, test_size=0.2, random_seed=0 ): - """Splits data into train and test sets. + """Split data into train and test sets. - Arguments: + Parameters --------- X (pd.DataFrame or np.ndarray): data of shape [n_samples, n_features] y (pd.Series, or np.ndarray): target data of length [n_samples] @@ -64,7 +63,8 @@ def split_data( test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%). random_seed (int): Seed for the random number generator. Defaults to 0. - Returns: + Returns + ------- pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets. """ @@ -99,11 +99,11 @@ def number_of_features(dtypes): """ Get the number of features of each specific dtype in a DataFrame. - Arguments: + Parameters --------- dtypes (pd.Series): DataFrame.dtypes to get the number of features for. - Returns: + Returns ------- pd.Series: dtypes and the number of features for each input type. @@ -125,11 +125,11 @@ def target_distribution(targets): """ Get the target distributions. - Arguments: + Parameters --------- targets (pd.Series): Target data. - Returns: + Returns ------- pd.Series: Target data and their frequency distribution as percentages. """ @@ -139,14 +139,14 @@ def target_distribution(targets): def drop_nan_target_rows(X, y): """ - Drops rows in X and y when row in the target y has a value of NaN. + Drop rows in X and y when row in the target y has a value of NaN. - Arguments: + Parameters --------- X (pd.DataFrame, np.ndarray): Data to transform. y (pd.Series, np.ndarray): Target data. - Returns: + Returns ------- pd.DataFrame, pd.DataFrame: Transformed X (and y, if passed in) with rows that had a NaN value removed. """ diff --git a/evalml/problem_types/utils.py b/evalml/problem_types/utils.py index 7aecc9f983..f423b4239c 100644 --- a/evalml/problem_types/utils.py +++ b/evalml/problem_types/utils.py @@ -7,7 +7,7 @@ def handle_problem_types(problem_type): """Handles problem_type by either returning the ProblemTypes or converting from a str. 
-    Arguments:
+    Parameters
        problem_type (str or ProblemTypes): Problem type that needs to be handled

     Returns:
@@ -29,7 +29,7 @@ def handle_problem_types(problem_type):
 def detect_problem_type(y):
     """Determine the type of problem being solved based on the targets (binary vs. multiclass classification, regression). Ignores missing and null data.

-    Arguments:
+    Parameters
        y (pd.Series): the target labels to predict

     Returns:
@@ -55,7 +55,7 @@ def detect_problem_type(y):
 def is_regression(problem_type):
     """Determines if the provided problem_type is a regression problem type.

-    Arguments:
+    Parameters
        problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.

     Returns:
@@ -70,7 +70,7 @@ def is_regression(problem_type):
 def is_binary(problem_type):
     """Determines if the provided problem_type is a binary classification problem type.

-    Arguments:
+    Parameters
        problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.

     Returns:
@@ -85,7 +85,7 @@ def is_binary(problem_type):
 def is_multiclass(problem_type):
     """Determines if the provided problem_type is a multiclass classification problem type.

-    Arguments:
+    Parameters
        problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.

     Returns:
@@ -100,7 +100,7 @@ def is_multiclass(problem_type):
 def is_classification(problem_type):
     """Determines if the provided problem_type is a classification problem type.

-    Arguments:
+    Parameters
        problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.

     Returns:
@@ -112,7 +112,7 @@ def is_classification(problem_type):
 def is_time_series(problem_type):
     """Determines if the provided problem_type is a time series problem type.

-    Arguments:
+    Parameters
        problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.

     Returns:
diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
index 095c2d68eb..44f142812d 100644
--- a/evalml/tests/conftest.py
+++ b/evalml/tests/conftest.py
@@ -1046,7 +1046,7 @@ def _imbalanced_data_X_y(problem_type, categorical_columns, size):
     For our targets, we maintain a 1:5, or 0.2, class ratio of minority : majority.
     We only generate the minimum amount for X to set the logical_types, so the length of X and y will be different.

-    Arguments:
+    Parameters
        problem_type (str): Either 'binary' or 'multiclass'
        categorical_columns (str): Determines how many categorical cols to use. Either 'all', 'some', or 'none'.
        size (str): Either 'large' or 'small'. 'large' returns a dataset of size 21,000, while 'small' returns a size of 4200
@@ -1106,7 +1106,7 @@ class _AutoMLTestEnv:
     def __init__(self, problem_type):
         """Create a test environment.

-        Arguments:
+        Parameters
            problem_type (str): The problem type corresponding to the search class you want to test.

     Attributes:
@@ -1206,7 +1206,7 @@ def test_context(
    ):
        """A context manager for creating an environment that patches time-consuming pipeline methods. Sets the mock_fit, mock_score, mock_encode_targets, mock_predict_proba, mock_optimize_threshold attributes.

-        Arguments:
+        Parameters
            score_return_value: Passed as the return_value argument of the pipeline.score patch.
mock_score_side_effect: Passed as the side_effect argument of the pipeline.score patch. Takes precedence over score_return_value. diff --git a/evalml/tuners/grid_search_tuner.py b/evalml/tuners/grid_search_tuner.py index 78585aebcc..2abdc0e644 100644 --- a/evalml/tuners/grid_search_tuner.py +++ b/evalml/tuners/grid_search_tuner.py @@ -9,7 +9,7 @@ class GridSearchTuner(Tuner): """Grid Search Optimizer, which generates all of the possible points to search for using a grid. - Arguments: + Parameters pipeline_hyperparameter_ranges (dict): a set of hyperparameter ranges corresponding to a pipeline's parameters n_points (int): The number of points to sample from along each dimension defined in the ``space`` argument. Defaults to 10. @@ -57,7 +57,7 @@ def __init__(self, pipeline_hyperparameter_ranges, n_points=10, random_seed=0): def add(self, pipeline_parameters, score): """Not applicable to grid search tuner as generated parameters are not dependent on scores of previous parameters. - Arguments: + Parameters pipeline_parameters (dict): a dict of the parameters used to evaluate a pipeline score (float): the score obtained by evaluating the pipeline with the provided parameters """ diff --git a/evalml/tuners/random_search_tuner.py b/evalml/tuners/random_search_tuner.py index 7cbf25dfa9..965e9b23be 100644 --- a/evalml/tuners/random_search_tuner.py +++ b/evalml/tuners/random_search_tuner.py @@ -7,7 +7,7 @@ class RandomSearchTuner(Tuner): """Random Search Optimizer. - Arguments: + Parameters pipeline_hyperparameter_ranges (dict): a set of hyperparameter ranges corresponding to a pipeline's parameters with_replacement (bool): If false, only unique hyperparameters will be shown replacement_max_attempts (int): The maximum number of tries to get a unique @@ -41,7 +41,7 @@ def __init__( def add(self, pipeline_parameters, score): """Not applicable to random search tuner as generated parameters are not dependent on scores of previous parameters. - Arguments: + Parameters pipeline_parameters (dict): A dict of the parameters used to evaluate a pipeline score (float): The score obtained by evaluating the pipeline with the provided parameters """ diff --git a/evalml/tuners/skopt_tuner.py b/evalml/tuners/skopt_tuner.py index 7a8626c7e1..71935e8a66 100644 --- a/evalml/tuners/skopt_tuner.py +++ b/evalml/tuners/skopt_tuner.py @@ -14,7 +14,7 @@ class SKOptTuner(Tuner): """Bayesian Optimizer. - Arguments: + Parameters pipeline_hyperparameter_ranges (dict): A set of hyperparameter ranges corresponding to a pipeline's parameters. random_seed (int): The seed for the random number generator. Defaults to 0. """ @@ -31,7 +31,7 @@ def __init__(self, pipeline_hyperparameter_ranges, random_seed=0): def add(self, pipeline_parameters, score): """Add score to sample. - Arguments: + Parameters pipeline_parameters (dict): A dict of the parameters used to evaluate a pipeline score (float): The score obtained by evaluating the pipeline with the provided parameters diff --git a/evalml/tuners/tuner.py b/evalml/tuners/tuner.py index 1143237d7b..3e5d776133 100644 --- a/evalml/tuners/tuner.py +++ b/evalml/tuners/tuner.py @@ -8,7 +8,7 @@ class Tuner(ABC): Tuners implement different strategies for sampling from a search space. They're used in EvalML to search the space of pipeline hyperparameters. - Arguments: + Parameters pipeline_hyperparameter_ranges (dict): a set of hyperparameter ranges corresponding to a pipeline's parameters. random_seed (int): The random state. Defaults to 0. 
""" @@ -92,7 +92,7 @@ def _convert_to_pipeline_parameters(self, flat_parameters): def add(self, pipeline_parameters, score): """Register a set of hyperparameters with the score obtained from training a pipeline with those hyperparameters. - Arguments: + Parameters pipeline_parameters (dict): a dict of the parameters used to evaluate a pipeline score (float): the score obtained by evaluating the pipeline with the provided parameters diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index b664f37637..97eaf61078 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -20,7 +20,7 @@ def import_or_raise(library, error_msg=None, warning=False): """Attempts to import the requested library by name. If the import fails, raises an ImportError or warning. - Arguments: + Parameters library (str): the name of the library error_msg (str): error message to return if the import fails warning (bool): if True, import_or_raise gives a warning instead of ImportError. Defaults to False. @@ -75,7 +75,7 @@ def convert_to_seconds(input_str): def get_random_state(seed): """Generates a numpy.random.RandomState instance using seed. - Arguments: + Parameters seed (None, int, np.random.RandomState object): seed to use to generate numpy.random.RandomState. Must be between SEED_BOUNDS.min_bound and SEED_BOUNDS.max_bound, inclusive. Otherwise, an exception will be thrown. """ if isinstance(seed, (int, np.integer)) and ( @@ -96,7 +96,7 @@ def get_random_seed( To protect against invalid input to a particular library's random number generator, if an int value is provided, and it is outside the bounds "[min_bound, max_bound)", the value will be projected into the range between the min_bound (inclusive) and max_bound (exclusive) using modular arithmetic. - Arguments: + Parameters random_state (int, numpy.random.RandomState): random state min_bound (None, int): if not default of None, will be min bound when generating seed (inclusive). Must be less than max_bound. max_bound (None, int): if not default of None, will be max bound when generating seed (exclusive). Must be greater than min_bound. @@ -149,7 +149,7 @@ def __get__(self, _, klass): def _get_subclasses(base_class): """Gets all of the leaf nodes in the hiearchy tree for a given base class. - Arguments: + Parameters base_class (abc.ABCMeta): Class to find all of the children for. Returns: @@ -187,7 +187,7 @@ def _get_subclasses(base_class): def get_importable_subclasses(base_class, used_in_automl=True): """Get importable subclasses of a base class. Used to list all of our estimators, transformers, components and pipelines dynamically. - Arguments: + Parameters base_class (abc.ABCMeta): Base class to find all of the subclasses for. args (list): Args used to instantiate the subclass. [{}] for a pipeline, and [] for all other classes. @@ -222,7 +222,7 @@ def get_importable_subclasses(base_class, used_in_automl=True): def _rename_column_names_to_numeric(X, flatten_tuples=True): """Used in LightGBM and XGBoost estimator classes to rename column names when the input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that these estimators cannot natively handle. - Arguments: + Parameters X (pd.DataFrame): The input training data of shape [n_samples, n_features] flatten_tuples (bool): Whether to flatten MultiIndex or tuple column names. LightGBM cannot handle columns with tuple names. 
@@ -250,7 +250,7 @@ def _rename_column_names_to_numeric(X, flatten_tuples=True):

 def jupyter_check():
     """Get whether or not the code is being run in an IPython environment (such as Jupyter Notebook or Jupyter Lab).

-    Arguments:
+    Parameters
        None

    Returns:
@@ -266,7 +266,7 @@ def jupyter_check():

 def safe_repr(value):
     """Convert the given value into a string that can safely be used for repr.

-    Arguments:
+    Parameters
        value: the item to convert

    Returns:
@@ -283,7 +283,7 @@ def safe_repr(value):

 def is_all_numeric(df):
     """Checks if the given DataFrame contains only numeric values.

-    Arguments:
+    Parameters
        df (pd.DataFrame): The DataFrame to check data types of.

    Returns:
@@ -301,7 +301,7 @@ def is_all_numeric(df):

 def pad_with_nans(pd_data, num_to_pad):
     """Pad the beginning num_to_pad rows with nans.

-    Arguments:
+    Parameters
        pd_data (pd.DataFrame or pd.Series): Data to pad.

    Returns:
@@ -326,7 +326,7 @@ def pad_with_nans(pd_data, num_to_pad):

 def _get_rows_without_nans(*data):
     """Compute a boolean array marking where all entries in the data are non-nan.

-    Arguments:
+    Parameters
        *data (sequence of pd.Series or pd.DataFrame)

    Returns:
@@ -351,7 +351,7 @@ def _not_nan(pd_data):

 def drop_rows_with_nans(*pd_data):
     """Drop rows that have any NaNs in all dataframes or series.

-    Arguments:
+    Parameters
        *pd_data (sequence of pd.Series or pd.DataFrame or None)

    Returns:
@@ -371,7 +371,7 @@ def _subset(pd_data):

 def _file_path_check(filepath=None, format="png", interactive=False, is_plotly=False):
     """Helper function to check the filepath being passed.

-    Arguments:
+    Parameters
        filepath (str or Path, optional): Location to save file.
        format (str): Extension for figure to be saved as. Defaults to 'png'.
        interactive (bool, optional): If True and fig is of type plotly.Figure, sets the format to 'html'.
@@ -406,7 +406,7 @@ def save_plot(
 ):
     """Saves fig to filepath if specified, or to a default location if not.

-    Arguments:
+    Parameters
        fig (Figure): Figure to be saved.
        filepath (str or Path, optional): Location to save file. Default is with filename "test_plot".
        format (str): Extension for figure to be saved as. Ignored if interactive is True and fig
@@ -474,7 +474,7 @@ def save_plot(

 def deprecate_arg(old_arg, new_arg, old_value, new_value):
     """Helper to raise warnings when a deprecated arg is used.

-    Arguments:
+    Parameters
        old_arg (str): Name of old/deprecated argument.
        new_arg (str): Name of new argument.
        old_value (Any): Value the user passed in for the old argument.
diff --git a/evalml/utils/logger.py b/evalml/utils/logger.py
index b5f2a6adcc..5c716cfe95 100644
--- a/evalml/utils/logger.py
+++ b/evalml/utils/logger.py
@@ -59,7 +59,7 @@ def log_subtitle(logger, title, underline="="):
 def time_elapsed(start_time):
     """How much time has elapsed since the search started.

-    Arguments:
+    Parameters
        start_time (int): Time when search started.

    Returns:
diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py
index b4cbaee625..fa995f5ffc 100644
--- a/evalml/utils/woodwork_utils.py
+++ b/evalml/utils/woodwork_utils.py
@@ -47,7 +47,7 @@ def _raise_value_error_if_nullable_types_detected(data):
 def infer_feature_types(data, feature_types=None):
     """Create a Woodwork structure from the given list, pandas, or numpy input, with specified types for columns. If a column's type is not specified, it will be inferred by Woodwork.

-    Arguments:
+    Parameters
        data (pd.DataFrame, pd.Series): Input data to convert to a Woodwork data structure.
feature_types (string, ww.logical_type obj, dict, optional): If data is a 2D structure, feature_types must be a dictionary mapping column names to the type of data represented in the column. If data is a 1D structure, then feature_types must be @@ -114,7 +114,7 @@ def _retain_custom_types_and_initalize_woodwork( ): """Helper method which will take an old Woodwork data structure and a new pandas data structure and return a new data structure that will try to retain as many logical types from the old data structure that exist in the new pandas data structure as possible. - Arguments: + Parameters old_logical_types (Dict): Logical types to try to retain. new_dataframe (pd.DataFrame): Pandas data structure ltypes_to_ignore (list): List of Woodwork logical types to ignore. Columns from the old DataFrame that have a logical type @@ -151,7 +151,7 @@ def _retain_custom_types_and_initalize_woodwork( def _convert_numeric_dataset_pandas(X, y): """Convert numeric and non-null data to pandas datatype. Raises ValueError if there is null or non-numeric data. Used with data sampler strategies. - Arguments: + Parameters X (pd.DataFrame, np.ndarray): Data to transform y (pd.Series, np.ndarray): Target data diff --git a/requirements.txt b/requirements.txt index d8c17d94d2..4130f5d7b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -r core-requirements.txt -plotly>=5.0.0 +plotly==5.0.0 kaleido>=0.1.0 ipywidgets>=7.5 xgboost>=1.4.2 From 677385cf950d8e01619d1ce1bbd3e6dbe392d96c Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sun, 29 Aug 2021 22:21:06 -0400 Subject: [PATCH 12/62] fix data checks --- Makefile | 2 +- .../data_checks/class_imbalance_data_check.py | 12 ++++------ evalml/data_checks/data_check.py | 8 +++---- evalml/data_checks/data_check_action.py | 4 +--- evalml/data_checks/data_check_action_code.py | 1 + evalml/data_checks/data_check_message.py | 4 +--- evalml/data_checks/data_checks.py | 5 +---- .../data_checks/datetime_format_data_check.py | 12 +++++----- evalml/data_checks/datetime_nan_data_check.py | 4 ++-- evalml/data_checks/default_data_checks.py | 4 +--- evalml/data_checks/highly_null_data_check.py | 15 +++++-------- evalml/data_checks/id_columns_data_check.py | 16 +++++--------- .../data_checks/invalid_targets_data_check.py | 7 +++--- .../multicollinearity_data_check.py | 7 +++--- .../natural_language_nan_data_check.py | 6 ++--- evalml/data_checks/no_variance_data_check.py | 8 +++---- evalml/data_checks/outliers_data_check.py | 22 +++++++++---------- evalml/data_checks/sparsity_data_check.py | 19 ++++++---------- .../target_distribution_data_check.py | 9 ++++---- .../data_checks/target_leakage_data_check.py | 15 +++++-------- evalml/data_checks/uniqueness_data_check.py | 16 +++++--------- evalml/data_checks/utils.py | 6 ++--- evalml/pipelines/component_graph.py | 2 +- evalml/pipelines/pipeline_base.py | 2 +- evalml/problem_types/utils.py | 2 +- evalml/tests/conftest.py | 2 +- evalml/tuners/grid_search_tuner.py | 2 +- evalml/tuners/random_search_tuner.py | 2 +- evalml/utils/gen_utils.py | 2 +- 29 files changed, 87 insertions(+), 129 deletions(-) diff --git a/Makefile b/Makefile index 82c0957ba0..d063a09f19 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ clean: .PHONY: lint lint: flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions - pydocstyle evalml --ignore=D107,D203,D212 + pydocstyle evalml --convention=numpy black evalml -t py39 --check .PHONY: lint-fix diff --git 
a/evalml/data_checks/class_imbalance_data_check.py b/evalml/data_checks/class_imbalance_data_check.py
index 76ff6765b7..77ed5af741 100644
--- a/evalml/data_checks/class_imbalance_data_check.py
+++ b/evalml/data_checks/class_imbalance_data_check.py
@@ -13,8 +13,7 @@


 class ClassImbalanceDataCheck(DataCheck):
-    """
-    Check if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds. Use for classification problems.
+    """Check if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds. Use for classification problems.

    Arguments
    ---------
@@ -25,7 +24,6 @@ class ClassImbalanceDataCheck(DataCheck):
            min_samples (int): The minimum number of samples per accepted class. If the minority class is both below the threshold and min_samples, then we consider this severely imbalanced. Must be greater than 0. Defaults to 100.
        num_cv_folds (int): The number of cross-validation folds. Must be positive. Choose 0 to ignore this warning. Defaults to 3.
-
    """

    def __init__(self, threshold=0.1, min_samples=100, num_cv_folds=3):
@@ -50,8 +48,7 @@ def __init__(self, threshold=0.1, min_samples=100, num_cv_folds=3):
        self.cv_folds = num_cv_folds * 2

    def validate(self, X, y):
-        """
-        Check if any target labels are imbalanced beyond a threshold for binary and multiclass problems Ignores NaN values in target labels if they appear.
+        """Check if any target labels are imbalanced beyond a threshold for binary and multiclass problems. Ignores NaN values in target labels if they appear.

        Arguments
        ---------
@@ -63,8 +60,8 @@ def validate(self, X, y):
            dict: Dictionary with DataCheckWarnings if imbalance in classes is less than the threshold,
                  and DataCheckErrors if the number of values for each target is below 2 * num_cv_folds.

-        Example
-        -------
+        Examples
+        --------
        >>> import pandas as pd
        >>> X = pd.DataFrame()
        >>> y = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
@@ -85,7 +82,6 @@ def validate(self, X, y):
        ...                          "code": "CLASS_IMBALANCE_SEVERE",
        ...                          "details": {"target_values": [0]}}],
        ...     "actions": []}
-
        """
        results = {"warnings": [], "errors": [], "actions": []}

diff --git a/evalml/data_checks/data_check.py b/evalml/data_checks/data_check.py
index 86dfbedaa4..418e07a99a 100644
--- a/evalml/data_checks/data_check.py
+++ b/evalml/data_checks/data_check.py
@@ -6,8 +6,7 @@


 class DataCheck(ABC):
-    """
-    Base class for all data checks.
+    """Base class for all data checks.

    Data checks are a set of heuristics used to determine if there are problems with input data.

@@ -15,13 +14,12 @@ class DataCheck(ABC):

    @classproperty
    def name(cls):
-        """Returns a name describing the data check."""
+        """Return a name describing the data check."""
        return str(cls.__name__)

    @abstractmethod
    def validate(self, X, y=None):
-        """
-        Inspect and validate the input data, runs any necessary calculations or algorithms, and returns a list of warnings and errors if applicable.
+        """Inspect and validate the input data, run any necessary calculations or algorithms, and return a list of warnings and errors if applicable.

        Arguments
        ---------
diff --git a/evalml/data_checks/data_check_action.py b/evalml/data_checks/data_check_action.py
index 03c80f58c0..52acd7fe02 100644
--- a/evalml/data_checks/data_check_action.py
+++ b/evalml/data_checks/data_check_action.py
@@ -2,14 +2,12 @@


 class DataCheckAction:
-    """
-    Recommended action returned by a DataCheck.
+    """Recommended action returned by a DataCheck.
Arguments --------- action_code (DataCheckActionCode): Action code associated with the action. metadata (dict, optional): Additional useful information associated with the action. Defaults to None. - """ def __init__(self, action_code, metadata=None): diff --git a/evalml/data_checks/data_check_action_code.py b/evalml/data_checks/data_check_action_code.py index 74eaa9fdbb..0bd48fe324 100644 --- a/evalml/data_checks/data_check_action_code.py +++ b/evalml/data_checks/data_check_action_code.py @@ -1,3 +1,4 @@ +"""Enum for data check action code.""" from enum import Enum diff --git a/evalml/data_checks/data_check_message.py b/evalml/data_checks/data_check_message.py index 1bab135d62..723555cb10 100644 --- a/evalml/data_checks/data_check_message.py +++ b/evalml/data_checks/data_check_message.py @@ -12,7 +12,6 @@ class DataCheckMessage: data_check_name (str): Name of data check. message_code (DataCheckMessageCode): Message code associated with message. Defaults to None. details (dict): Additional useful information associated with the message. Defaults to None. - """ message_type = None @@ -28,8 +27,7 @@ def __str__(self): return self.message def __eq__(self, other): - """ - Check for equality. + """Check for equality. Two DataCheckMessage objs are considered equivalent if all of their attributes are equivalent. diff --git a/evalml/data_checks/data_checks.py b/evalml/data_checks/data_checks.py index 106ad7d992..29f696567e 100644 --- a/evalml/data_checks/data_checks.py +++ b/evalml/data_checks/data_checks.py @@ -24,7 +24,6 @@ class DataChecks: --------- data_checks (list (DataCheck)): List of DataCheck objects. data_check_params (dict): Parameters for passed DataCheck objects. - """ @staticmethod @@ -88,8 +87,7 @@ def __init__(self, data_checks=None, data_check_params=None): self.data_checks = data_check_instances def validate(self, X, y=None): - """ - Inspect and validate the input data against data checks and returns a list of warnings and errors if applicable. + """Inspect and validate the input data against data checks and returns a list of warnings and errors if applicable. Arguments --------- @@ -99,7 +97,6 @@ def validate(self, X, y=None): Returns ------- dict: Dictionary containing DataCheckMessage objects - """ messages = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py index 665706282b..f185916543 100644 --- a/evalml/data_checks/datetime_format_data_check.py +++ b/evalml/data_checks/datetime_format_data_check.py @@ -1,5 +1,4 @@ -"""Data check that checks if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators. -""" +"""Data check that checks if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators.""" import pandas as pd from evalml.data_checks import DataCheck, DataCheckError, DataCheckMessageCode @@ -7,8 +6,7 @@ class DateTimeFormatDataCheck(DataCheck): - """ - Check if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators. + """Check if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators. 
Parameters
    ----------
@@ -19,7 +17,7 @@ def __init__(self, datetime_column="index"):
        self.datetime_column = datetime_column

    def validate(self, X, y):
-        """Checks if the target data has equal intervals and is sorted.
+        """Check if the target data has equal intervals and is sorted.

        Parameters
        ----------
@@ -30,8 +28,8 @@ def validate(self, X, y):
        -------
            dict (DataCheckError): List with DataCheckErrors if unequal intervals are found in the datetime column.

-        Example
-        -------
+        Examples
+        --------
        >>> import pandas as pd
        >>> X = pd.DataFrame(pd.date_range("January 1, 2021", periods=8), columns=["dates"])
        >>> y = pd.Series([1, 2, 4, 2, 1, 2, 3, 1])
diff --git a/evalml/data_checks/datetime_nan_data_check.py b/evalml/data_checks/datetime_nan_data_check.py
index dbef5b4d9a..55a2e9d60a 100644
--- a/evalml/data_checks/datetime_nan_data_check.py
+++ b/evalml/data_checks/datetime_nan_data_check.py
@@ -21,8 +21,8 @@ def validate(self, X, y=None):
        -------
            dict: dict with a DataCheckError if NaN values are present in datetime columns.

-        Example
-        -------
+        Examples
+        --------
        >>> import pandas as pd
        >>> import woodwork as ww
        >>> import numpy as np
diff --git a/evalml/data_checks/default_data_checks.py b/evalml/data_checks/default_data_checks.py
index b71f2fc987..2d1c5cd7fb 100644
--- a/evalml/data_checks/default_data_checks.py
+++ b/evalml/data_checks/default_data_checks.py
@@ -19,8 +19,7 @@


 class DefaultDataChecks(DataChecks):
-    """
-    A collection of basic data checks that is used by AutoML by default.
+    """A collection of basic data checks that is used by AutoML by default.

    Includes:

@@ -43,7 +42,6 @@ class DefaultDataChecks(DataChecks):
        n_splits (int): The number of splits as determined by the data splitter being used. Defaults to 3.
        datetime_column (str): The name of the column containing datetime information to be used for
            time series problems. Default to "index" indicating that the datetime information is in the index of X or y.
-
    """

    _DEFAULT_DATA_CHECK_CLASSES = [
diff --git a/evalml/data_checks/highly_null_data_check.py b/evalml/data_checks/highly_null_data_check.py
index 554da323dd..9b0b1c1ce1 100644
--- a/evalml/data_checks/highly_null_data_check.py
+++ b/evalml/data_checks/highly_null_data_check.py
@@ -11,16 +11,14 @@


 class HighlyNullDataCheck(DataCheck):
-    """
-    Check if there are any highly-null columns and rows in the input.
+    """Check if there are any highly-null columns and rows in the input.

    Parameters
-    ---------
+    ----------
        pct_null_col_threshold(float): If the percentage of NaN values in an input feature exceeds this amount,
            that column will be considered highly-null. Defaults to 0.95.
        pct_null_row_threshold(float): If the percentage of NaN values in an input row exceeds this amount,
            that row will be considered highly-null. Defaults to 0.95.
-
    """

    def __init__(self, pct_null_col_threshold=0.95, pct_null_row_threshold=0.95):
@@ -37,11 +35,10 @@ def __init__(self, pct_null_col_threshold=0.95, pct_null_row_threshold=0.95):
        self.pct_null_row_threshold = pct_null_row_threshold

    def validate(self, X, y=None):
-        """
-        Check if there are any highly-null columns or rows in the input.
+        """Check if there are any highly-null columns or rows in the input.

        Parameters
-        ---------
+        ----------
        X (pd.DataFrame, np.ndarray): Features.
        y (pd.Series, np.ndarray): Ignored.

        Returns
        -------
        dict: dict with a DataCheckWarning if there are any highly-null columns or rows.

-        Example:
-        -------
+        Examples
+        --------
        >>> import pandas as pd
        >>> class SeriesWrap():
        ...
def __init__(self, series): diff --git a/evalml/data_checks/id_columns_data_check.py b/evalml/data_checks/id_columns_data_check.py index 18db3c07f8..d6bb90e782 100644 --- a/evalml/data_checks/id_columns_data_check.py +++ b/evalml/data_checks/id_columns_data_check.py @@ -10,13 +10,11 @@ class IDColumnsDataCheck(DataCheck): - """ - Check if any of the features are likely to be ID columns. + """Check if any of the features are likely to be ID columns. Parameters - --------- + ---------- id_threshold (float): The probability threshold to be considered an ID column. Defaults to 1.0. - """ def __init__(self, id_threshold=1.0): @@ -25,15 +23,14 @@ def __init__(self, id_threshold=1.0): self.id_threshold = id_threshold def validate(self, X, y=None): - """ - Check if any of the features are likely to be ID columns. Currently performs a number of simple checks. + """Check if any of the features are likely to be ID columns. Currently performs a number of simple checks. - column name is "id" - column name ends in "_id" - column contains all unique values (and is categorical / integer type) Parameters - --------- + ---------- X (pd.DataFrame, np.ndarray): The input features to check.T y (pd.Series): The target. Defaults to None. Ignored. @@ -41,8 +38,8 @@ def validate(self, X, y=None): ------- dict: A dictionary of features with column name or index and their probability of being ID columns - Example: - ------- + Examples + -------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'df_id': [0, 1, 2, 3, 4], @@ -59,7 +56,6 @@ def validate(self, X, y=None): ... "details": {"column": "df_id"}}], ... "actions": [{"code": "DROP_COL", ... "metadata": {"column": "df_id"}}]} - """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py index 4deb3be466..711f6a25ab 100644 --- a/evalml/data_checks/invalid_targets_data_check.py +++ b/evalml/data_checks/invalid_targets_data_check.py @@ -24,8 +24,7 @@ class InvalidTargetDataCheck(DataCheck): - """ - Check if the target data contains missing or invalid values. + """Check if the target data contains missing or invalid values. Arguments --------- @@ -57,8 +56,8 @@ def validate(self, X, y): ------- dict (DataCheckError): List with DataCheckErrors if any invalid values are found in the target data. - Example - ------- + Examples + -------- >>> import pandas as pd >>> X = pd.DataFrame({"col": [1, 2, 3, 1]}) >>> y = pd.Series([0, 1, None, None]) diff --git a/evalml/data_checks/multicollinearity_data_check.py b/evalml/data_checks/multicollinearity_data_check.py index 24c9f39d33..583db4ae16 100644 --- a/evalml/data_checks/multicollinearity_data_check.py +++ b/evalml/data_checks/multicollinearity_data_check.py @@ -11,7 +11,7 @@ class MulticollinearityDataCheck(DataCheck): """Check if any set features are likely to be multicollinear. Parameters - --------- + ---------- threshold (float): The threshold to be considered. Defaults to 0.9. """ @@ -24,7 +24,7 @@ def validate(self, X, y=None): """Check if any set of features are likely to be multicollinear. Parameters - --------- + ---------- X (pd.DataFrame): The input features to check. y (pd.Series): The target. Ignored. @@ -32,7 +32,8 @@ def validate(self, X, y=None): ------- dict: dict with a DataCheckWarning if there are any potentially multicollinear columns. 
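
For intuition, a minimal sketch of the thresholding idea behind this check. Note that evalml's MulticollinearityDataCheck computes pairwise mutual information (via Woodwork) rather than the Pearson correlation used here, so this is a rough analogue rather than the library's implementation:

    # Sketch: flag feature pairs whose absolute Pearson correlation meets a threshold.
    import pandas as pd

    def correlated_pairs(X, threshold=0.9):
        corr = X.corr().abs()
        cols = list(corr.columns)
        return [
            (a, b)
            for i, a in enumerate(cols)
            for b in cols[i + 1:]
            if corr.loc[a, b] >= threshold
        ]

    X = pd.DataFrame({"col_1": [1, 0, 2, 3, 4], "col_2": [3, 0, 6, 9, 12]})
    print(correlated_pairs(X))  # [('col_1', 'col_2')] -- col_2 is an exact multiple of col_1
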
-        Example:
+        Examples
+        --------
        >>> import pandas as pd
        >>> col = pd.Series([1, 0, 2, 3, 4])
        >>> X = pd.DataFrame({"col_1": col, "col_2": col * 3})
diff --git a/evalml/data_checks/natural_language_nan_data_check.py b/evalml/data_checks/natural_language_nan_data_check.py
index 9f0daff27f..0789bc7e62 100644
--- a/evalml/data_checks/natural_language_nan_data_check.py
+++ b/evalml/data_checks/natural_language_nan_data_check.py
@@ -13,7 +13,7 @@ def validate(self, X, y=None):
        Check if any natural language columns contain NaN values.

        Parameters
-        ---------
+        ----------
        X (pd.DataFrame, np.ndarray): Features.
        y (pd.Series, np.ndarray): Ignored. Defaults to None.

        Returns
        -------
        dict: dict with a DataCheckError if NaN values are present in natural language columns.

-        Example:
-        -------
+        Examples
+        --------
        >>> import pandas as pd
        >>> import woodwork as ww
        >>> import numpy as np
diff --git a/evalml/data_checks/no_variance_data_check.py b/evalml/data_checks/no_variance_data_check.py
index ed7259d213..57dfd926c9 100644
--- a/evalml/data_checks/no_variance_data_check.py
+++ b/evalml/data_checks/no_variance_data_check.py
@@ -18,7 +18,7 @@ class NoVarianceDataCheck(DataCheck):
    Check if the target or any of the features have no variance.

    Parameters
-    ---------
+    ----------
        count_nan_as_value (bool): If True, missing values will be counted as their own unique value.
            Additionally, if true, will return a DataCheckWarning instead of an error
            if the feature has mostly missing data and only one unique value.

@@ -34,7 +34,7 @@ def _check_for_errors(self, column_name, count_unique, any_nulls):
    Check if a column has no variance.

    Parameters
-    ---------
+    ----------
        column_name (str): Name of the column we are checking.
        count_unique (float): Number of unique values in this column.
        any_nulls (bool): Whether this column has any missing data.

    Returns
    -------
        DataCheckError if the column has no variance or DataCheckWarning if the column has two unique values including NaN.
-
    """

    message = f"{column_name} has {int(count_unique)} unique value."
@@ -69,14 +68,13 @@ def validate(self, X, y):
    Check if the target or any of the features have no variance (1 unique value).

    Parameters
-    ---------
+    ----------
        X (pd.DataFrame, np.ndarray): The input features.
        y (pd.Series, np.ndarray): The target data.

    Returns
    -------
        dict: dict of warnings/errors corresponding to features or target with no variance.
-
    """
    results = {"warnings": [], "errors": [], "actions": []}

diff --git a/evalml/data_checks/outliers_data_check.py b/evalml/data_checks/outliers_data_check.py
index 9a1b7a81fc..ee3f82e93f 100644
--- a/evalml/data_checks/outliers_data_check.py
+++ b/evalml/data_checks/outliers_data_check.py
@@ -18,20 +18,19 @@ class OutliersDataCheck(DataCheck):
    """

    def validate(self, X, y=None):
-        """
-        Check if there are any outliers in a dataframe by using IQR to determine column anomalies. Column with anomalies are considered to contain outliers.
+        """Check if there are any outliers in a dataframe by using IQR to determine column anomalies. Columns with anomalies are considered to contain outliers.

        Parameters
-        ---------
+        ----------
        X (pd.DataFrame, np.ndarray): Input features.
        y (pd.Series, np.ndarray): Ignored. Defaults to None.

-        Return
-        ------
+        Returns
+        -------
        dict: A dictionary with warnings if any columns have outliers.
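
As a concrete reference for the IQR approach named above, a minimal sketch of the standard Tukey-fence version; OutliersDataCheck combines this kind of per-column test with the probability estimate described below:

    # Sketch: mark values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] as potential outliers.
    import pandas as pd

    def iqr_outlier_mask(col, k=1.5):
        q1, q3 = col.quantile(0.25), col.quantile(0.75)
        iqr = q3 - q1
        return (col < q1 - k * iqr) | (col > q3 + k * iqr)

    col = pd.Series([1, 2, 3, 4, 5, 1000])
    print(col[iqr_outlier_mask(col)].tolist())  # [1000]
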
-        Example
-        -------
+        Examples
+        --------
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...     'x': [1, 2, 3, 4, 5],
@@ -87,13 +86,12 @@ def validate(self, X, y=None):

    @staticmethod
    def _no_outlier_prob(num_records: int, pct_outliers: float) -> float:
-        """
-        Calculate the probability that there are no true outliers in a numeric (integer or float) column. It is based on creating 100,000 samples consisting of a given number of records, and then repeating this over a grid of sample sizes. Each value in a sample is drawn from a log normal distribution, and then the number of potential outliers in the data is determined using the skew adjusted box plot approach based on the medcouple statistic. It was observed that the distribution of the percentage of outliers could be described by a gamma distribution, with the shape and scale parameters changing with the sample size. For each sample size, the shape and scale parameters of the gamma distriubtion were estimated using maximum likelihood methods. The set of estimate shape and scale parameters for different sample size were then used to fit equations that relate these two parameters to the sample size. These equations use a transendental logrithmic functional form that provides a seventh order Taylor series approximation to the two true functional relationships, and was estimated using least squares regression.
+        """Calculate the probability that there are no true outliers in a numeric (integer or float) column. It is based on creating 100,000 samples consisting of a given number of records, and then repeating this over a grid of sample sizes. Each value in a sample is drawn from a log normal distribution, and then the number of potential outliers in the data is determined using the skew adjusted box plot approach based on the medcouple statistic. It was observed that the distribution of the percentage of outliers could be described by a gamma distribution, with the shape and scale parameters changing with the sample size. For each sample size, the shape and scale parameters of the gamma distribution were estimated using maximum likelihood methods. The estimated shape and scale parameters for the different sample sizes were then used to fit equations that relate these two parameters to the sample size. These equations use a transcendental logarithmic functional form that provides a seventh order Taylor series approximation to the two true functional relationships, and were estimated using least squares regression.

        Original credit goes to Jad Raad and Dan Putler of Alteryx.

        Parameters
-        ---------
+        ----------
        num_records (int): The integer number of non-missing values in a column.
        pct_outliers (float): The percentage of potential outliers in a column.

diff --git a/evalml/data_checks/sparsity_data_check.py b/evalml/data_checks/sparsity_data_check.py
index 899c29af20..177604cee4 100644
--- a/evalml/data_checks/sparsity_data_check.py
+++ b/evalml/data_checks/sparsity_data_check.py
@@ -13,11 +13,10 @@


 class SparsityDataCheck(DataCheck):
-    """
-    Check if there are any columns with sparsely populated values in the input.
+    """Check if there are any columns with sparsely populated values in the input.

    Parameters
-    ---------
+    ----------
        problem_type (str or ProblemTypes): The specific problem type to data check for.
            'multiclass' or 'time series multiclass' is the only accepted problem type.
        threshold (float): The threshold value, or percentage of each column's unique values,
@@ -25,7 +24,6 @@ class SparsityDataCheck(DataCheck):
        unique_count_threshold (int): The minimum number of times a unique
            value has to be present in a column to not be considered "sparse."
            Defaults to 10.
-
    """

    def __init__(self, problem_type, threshold, unique_count_threshold=10):
@@ -40,11 +38,10 @@ def __init__(self, problem_type, threshold, unique_count_threshold=10):
            raise ValueError("Unique count threshold must be positive integer.")

    def validate(self, X, y=None):
-        """
-        Calculate what percentage of each column's unique values exceed the count threshold and compare that percentage to the sparsity threshold stored in the class instance.
+        """Calculate what percentage of each column's unique values exceed the count threshold and compare that percentage to the sparsity threshold stored in the class instance.

        Parameters
-        ---------
+        ----------
        X (pd.DataFrame, np.ndarray): Features.
        y (pd.Series, np.ndarray): Ignored.

        Returns
        -------
        dict: dict with a DataCheckWarning if there are any sparse columns.

-        Example:
-        -------
+        Examples
+        --------
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...    'sparse': [float(x) for x in range(100)],
        ...    'not_sparse': [float(1) for x in range(100)]
        ... })
        >>> sparsity_check = SparsityDataCheck(problem_type="multiclass", threshold=0.4, unique_count_threshold=3)
        >>> assert sparsity_check.validate(df) == {"errors": [],
        ...     "warnings": [{"message": "Input columns (sparse) for multiclass problem type are too sparse.",
        ...                   "data_check_name": "SparsityDataCheck",
        ...                   "level": "warning",
        ...                   "code": "TOO_SPARSE",
        ...                   "details": {"column": "sparse", 'sparsity_score': 0.0}}],
        ...     "actions": [{"code": "DROP_COL",
        ...                  "metadata": {"column": "sparse"}}]}
-
        """
        results = {"warnings": [], "errors": [], "actions": []}

@@ -108,7 +104,7 @@ def sparsity_score(col, count_threshold=10):
    Calculate a sparsity score for the given value counts by calculating the percentage of unique values that exceed the count_threshold.

    Parameters
-    ---------
+    ----------
        col (pd.Series): Feature values.
        count_threshold (int): The number of instances below which a value
            is considered sparse. Default is 10.

    Returns
    -------
        (float): Sparsity score, or the percentage of the unique values that exceed count_threshold.
-
    """
    counts = col.value_counts()
    score = sum(counts > count_threshold) / counts.size
diff --git a/evalml/data_checks/target_distribution_data_check.py b/evalml/data_checks/target_distribution_data_check.py
index 1a564a25ef..72ee5c1160 100644
--- a/evalml/data_checks/target_distribution_data_check.py
+++ b/evalml/data_checks/target_distribution_data_check.py
@@ -18,11 +18,10 @@ class TargetDistributionDataCheck(DataCheck):
    """Check if the target data contains certain distributions that may need to be transformed prior to training to improve model performance."""

    def validate(self, X, y):
-        """
-        Check if the target data has a certain distribution.
+        """Check if the target data has a certain distribution.

        Parameters
-        ---------
+        ----------
        X (pd.DataFrame, np.ndarray): Features. Ignored.
        y (pd.Series, np.ndarray): Target data to check for underlying distributions.

        Returns
        -------
        dict (DataCheckError): List with DataCheckErrors if certain distributions are found in the target data.
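
One way to see what "certain distributions" means in practice: if a normality test scores log(y) much better than y, the target looks lognormal and a log transform is worth considering. This sketch uses scipy's Shapiro-Wilk test for illustration only; it is not necessarily the exact statistic the check applies:

    # Sketch: compare normality of y vs. log(y) to spot a lognormal-looking target.
    import numpy as np
    from scipy.stats import shapiro

    y = np.random.default_rng(0).lognormal(mean=0.0, sigma=0.6, size=200)
    print(shapiro(y).pvalue < shapiro(np.log(y)).pvalue)  # True here: log(y) looks more normal
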
- Example: - ------- + Examples + -------- >>> from scipy.stats import lognorm >>> X = None >>> y = [0.946, 0.972, 1.154, 0.954, 0.969, 1.222, 1.038, 0.999, 0.973, 0.897] diff --git a/evalml/data_checks/target_leakage_data_check.py b/evalml/data_checks/target_leakage_data_check.py index 29aa3b2be1..b179f7a06b 100644 --- a/evalml/data_checks/target_leakage_data_check.py +++ b/evalml/data_checks/target_leakage_data_check.py @@ -16,18 +16,16 @@ class TargetLeakageDataCheck(DataCheck): - """ - Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation. + """Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation. If `method='mutual'`, this data check uses mutual information and supports all target and feature types. Otherwise, if `method='pearson'`, it uses Pearson correlation and only supports binary with numeric and boolean dtypes. Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. Parameters - --------- + ---------- pct_corr_threshold (float): The correlation threshold to be considered leakage. Defaults to 0.95. method (string): The method to determine correlation. Use 'mutual' for mutual information, otherwise 'pearson' for Pearson correlation. Defaults to 'mutual'. - """ def __init__(self, pct_corr_threshold=0.95, method="mutual"): @@ -70,14 +68,13 @@ def _calculate_mutual_information(self, X, y): return highly_corr_cols def validate(self, X, y): - """ - Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation. + """Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation. If `method='mutual'`, supports all target and feature types. Otherwise, if `method='pearson'` only supports binary with numeric and boolean dtypes. Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. Parameters - --------- + ---------- X (pd.DataFrame, np.ndarray): The input features to check y (pd.Series, np.ndarray): The target data @@ -85,8 +82,8 @@ def validate(self, X, y): ------- dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected. - Example: - ------- + Examples + -------- >>> import pandas as pd >>> X = pd.DataFrame({ ... 'leak': [10, 42, 31, 51, 61], diff --git a/evalml/data_checks/uniqueness_data_check.py b/evalml/data_checks/uniqueness_data_check.py index 3a65e536df..14c959e3c7 100644 --- a/evalml/data_checks/uniqueness_data_check.py +++ b/evalml/data_checks/uniqueness_data_check.py @@ -20,8 +20,7 @@ class UniquenessDataCheck(DataCheck): - """ - Check if there are any columns in the input that are either too unique for classification problems or not unique enough for regression problems. + """Check if there are any columns in the input that are either too unique for classification problems or not unique enough for regression problems. Arguments --------- @@ -38,8 +37,7 @@ def __init__(self, problem_type, threshold=0.50): self.threshold = threshold def validate(self, X, y=None): - """ - Check if there are any columns in the input that are too unique in the case of classification problems or not unique enough in the case of regression problems. + """Check if there are any columns in the input that are too unique in the case of classification problems or not unique enough in the case of regression problems. 
Parameters ---------- @@ -51,8 +49,8 @@ def validate(self, X, y=None): dict: dict with a DataCheckWarning if there are any too unique or not unique enough columns. - Example: - ------- + Examples + -------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'regression_unique_enough': [float(x) for x in range(100)], @@ -131,19 +129,17 @@ def validate(self, X, y=None): @staticmethod def uniqueness_score(col): - """ - Calculate a uniqueness score for the provided field. NaN values are not considered as unique values in the calculation. + """Calculate a uniqueness score for the provided field. NaN values are not considered as unique values in the calculation. Based on the Herfindahl–Hirschman Index. Parameters - --------- + ---------- col (pd.Series): Feature values. Returns ------- (float): Uniqueness score. - """ norm_counts = col.value_counts() / col.value_counts().sum() square_counts = norm_counts ** 2 diff --git a/evalml/data_checks/utils.py b/evalml/data_checks/utils.py index 9c996f07a6..d8d7e88190 100644 --- a/evalml/data_checks/utils.py +++ b/evalml/data_checks/utils.py @@ -3,13 +3,11 @@ class EmptyDataChecks(DataChecks): - """ - An empty collection of data checks. + """An empty collection of data checks. Parameters - --------- + ---------- data_checks (list (DataCheck)): Ignored. - """ def __init__(self, data_checks=None): diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 0e4f1543b1..ffa73d137e 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -32,7 +32,7 @@ class ComponentGraph: component_dict (dict): A dictionary which specifies the components and edges between components that should be used to create the component graph. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. - Example: + Examples >>> component_dict = {'imputer': ['Imputer'], 'ohe': ['One Hot Encoder', 'imputer.x'], 'estimator_1': ['Random Forest Classifier', 'ohe.x'], 'estimator_2': ['Decision Tree Classifier', 'ohe.x'], 'final': ['Logistic Regression Classifier', 'estimator_1', 'estimator_2']} >>> component_graph = ComponentGraph(component_dict) """ diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 9354c7d629..38b5699f5f 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -130,7 +130,7 @@ def name(self): def summary(self): """A short summary of the pipeline structure, describing the list of components used. - Example: Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder + Examples Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder """ component_graph = [ type(self.component_graph.component_instances[component]) diff --git a/evalml/problem_types/utils.py b/evalml/problem_types/utils.py index f423b4239c..0f5237adc7 100644 --- a/evalml/problem_types/utils.py +++ b/evalml/problem_types/utils.py @@ -35,7 +35,7 @@ def detect_problem_type(y): Returns: ProblemType: ProblemType Enum - Example: + Examples >>> y = pd.Series([0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1]) >>> problem_type = detect_problem_type(y) >>> assert problem_type == ProblemTypes.BINARY diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 44f142812d..9212dff764 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -1093,7 +1093,7 @@ class _AutoMLTestEnv: tests that patch Estimator.fit instead of Pipeline.fit or tests that only want to patch a selective subset of the methods listed above. 
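
The patching pattern test_context provides boils down to unittest.mock from the standard library. A standalone sketch of the same idea; the patch target shown is illustrative, and _AutoMLTestEnv wires several such patches up at once:

    # Sketch: stub out the expensive pipeline.score call during a test.
    from unittest.mock import patch

    with patch(
        "evalml.pipelines.BinaryClassificationPipeline.score",
        return_value={"Log Loss Binary": 0.3},
    ) as mock_score:
        pass  # run the search under test here; every score() call returns the stub
    print(mock_score.call_count)  # 0 in this empty sketch
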
- Example: + Examples >>> env = _AutoMLTestEnv(problem_type="binary") >>> # run_search is short-hand for creating the context manager and then running search >>> # env.run_search(automl, score_return_value={automl.objective.name: 1.0}) diff --git a/evalml/tuners/grid_search_tuner.py b/evalml/tuners/grid_search_tuner.py index 2abdc0e644..414696724f 100644 --- a/evalml/tuners/grid_search_tuner.py +++ b/evalml/tuners/grid_search_tuner.py @@ -15,7 +15,7 @@ class GridSearchTuner(Tuner): defined in the ``space`` argument. Defaults to 10. random_seed (int): Seed for random number generator. Unused in this class, defaults to 0. - Example: + Examples >>> tuner = GridSearchTuner({'My Component': {'param a': [0.0, 10.0], 'param b': ['a', 'b', 'c']}}, n_points=5) >>> proposal = tuner.propose() >>> assert proposal.keys() == {'My Component'} diff --git a/evalml/tuners/random_search_tuner.py b/evalml/tuners/random_search_tuner.py index 965e9b23be..e6f3094ce9 100644 --- a/evalml/tuners/random_search_tuner.py +++ b/evalml/tuners/random_search_tuner.py @@ -15,7 +15,7 @@ class RandomSearchTuner(Tuner): with_replacement=True random_seed (int): Seed for random number generator. Defaults to 0. - Example: + Examples >>> tuner = RandomSearchTuner({'My Component': {'param a': [0.0, 10.0], 'param b': ['a', 'b', 'c']}}, random_seed=42) >>> proposal = tuner.propose() >>> assert proposal.keys() == {'My Component'} diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index 97eaf61078..1e0ddd2d6d 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -120,7 +120,7 @@ def get_random_seed( class classproperty: """Allows function to be accessed as a class level property. - Example: + Examples .. code-block:: From 4674ea61c76bac75c39758c97091616b5cdd17df Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Mon, 30 Aug 2021 00:19:29 -0400 Subject: [PATCH 13/62] cleanup more --- Makefile | 2 +- docs/source/user_guide/pipelines.ipynb | 2 +- evalml/automl/__init__.py | 1 + evalml/automl/automl_algorithm/__init__.py | 1 + .../automl_algorithm/automl_algorithm.py | 7 +- .../automl_algorithm/evalml_algorithm.py | 36 ++++---- .../automl_algorithm/iterative_algorithm.py | 7 +- evalml/automl/automl_search.py | 42 +++++----- evalml/automl/callbacks.py | 3 +- evalml/automl/engine/__init__.py | 1 + evalml/automl/engine/cf_engine.py | 84 +++++++++---------- evalml/automl/engine/dask_engine.py | 61 +++++++------- evalml/automl/engine/engine_base.py | 44 +++++----- evalml/automl/engine/sequential_engine.py | 31 ++++++- evalml/automl/pipeline_search_plots.py | 20 +++-- evalml/automl/utils.py | 20 ++--- .../data_checks/class_imbalance_data_check.py | 15 ++-- evalml/data_checks/data_check.py | 6 +- evalml/data_checks/data_check_action.py | 6 +- evalml/data_checks/data_check_message.py | 6 +- evalml/data_checks/data_checks.py | 14 ++-- .../data_checks/datetime_format_data_check.py | 13 +-- evalml/data_checks/datetime_nan_data_check.py | 9 +- evalml/data_checks/default_data_checks.py | 3 +- evalml/data_checks/highly_null_data_check.py | 15 ++-- evalml/data_checks/id_columns_data_check.py | 12 +-- .../data_checks/invalid_targets_data_check.py | 10 +-- .../multicollinearity_data_check.py | 12 +-- .../natural_language_nan_data_check.py | 11 +-- evalml/data_checks/no_variance_data_check.py | 25 ++---- evalml/data_checks/outliers_data_check.py | 18 ++-- evalml/data_checks/sparsity_data_check.py | 21 ++--- .../target_distribution_data_check.py | 11 +-- .../data_checks/target_leakage_data_check.py | 13 ++- 
evalml/data_checks/uniqueness_data_check.py | 18 ++-- evalml/data_checks/utils.py | 3 +- evalml/demos/breast_cancer.py | 3 +- evalml/demos/churn.py | 6 +- evalml/demos/diabetes.py | 5 +- evalml/demos/fraud.py | 6 +- evalml/demos/wine.py | 3 +- evalml/exceptions/__init__.py | 1 + evalml/exceptions/exceptions.py | 7 +- evalml/model_family/utils.py | 4 +- evalml/model_understanding/__init__.py | 1 + evalml/model_understanding/force_plots.py | 2 +- evalml/model_understanding/graphs.py | 52 ++++++------ .../permutation_importance.py | 5 +- .../prediction_explanations/__init__.py | 1 + .../prediction_explanations/_algorithms.py | 24 +++--- .../_report_creator_factory.py | 2 +- .../_user_interface.py | 23 +++-- .../prediction_explanations/explainers.py | 21 +++-- .../binary_classification_objective.py | 4 +- evalml/objectives/cost_benefit_matrix.py | 6 +- evalml/objectives/fraud_cost.py | 4 +- evalml/objectives/lead_scoring.py | 4 +- evalml/objectives/objective_base.py | 10 +-- evalml/objectives/sensitivity_low_alert.py | 6 +- evalml/objectives/utils.py | 12 +-- .../binary_classification_pipeline.py | 10 +-- .../binary_classification_pipeline_mixin.py | 2 +- evalml/pipelines/classification_pipeline.py | 22 ++--- evalml/pipelines/component_graph.py | 62 +++++++------- evalml/pipelines/components/component_base.py | 24 +++--- .../ensemble/sklearn_stacked_ensemble_base.py | 4 +- .../sklearn_stacked_ensemble_classifier.py | 2 +- .../sklearn_stacked_ensemble_regressor.py | 2 +- .../classifiers/baseline_classifier.py | 6 +- .../classifiers/catboost_classifier.py | 2 +- .../classifiers/decision_tree_classifier.py | 2 +- .../classifiers/elasticnet_classifier.py | 2 +- .../estimators/classifiers/et_classifier.py | 2 +- .../classifiers/kneighbors_classifier.py | 2 +- .../classifiers/lightgbm_classifier.py | 2 +- .../logistic_regression_classifier.py | 2 +- .../estimators/classifiers/rf_classifier.py | 2 +- .../estimators/classifiers/svm_classifier.py | 2 +- .../classifiers/xgboost_classifier.py | 2 +- .../components/estimators/estimator.py | 12 +-- .../estimators/regressors/arima_regressor.py | 2 +- .../regressors/baseline_regressor.py | 4 +- .../regressors/catboost_regressor.py | 2 +- .../regressors/decision_tree_regressor.py | 2 +- .../regressors/elasticnet_regressor.py | 2 +- .../estimators/regressors/et_regressor.py | 2 +- .../regressors/lightgbm_regressor.py | 2 +- .../estimators/regressors/linear_regressor.py | 2 +- .../regressors/prophet_regressor.py | 2 +- .../estimators/regressors/rf_regressor.py | 2 +- .../estimators/regressors/svm_regressor.py | 2 +- .../time_series_baseline_estimator.py | 4 +- .../regressors/xgboost_regressor.py | 2 +- .../transformers/column_selectors.py | 20 ++--- .../dimensionality_reduction/lda.py | 2 +- .../dimensionality_reduction/pca.py | 2 +- .../transformers/encoders/onehot_encoder.py | 14 ++-- .../transformers/encoders/target_encoder.py | 4 +- .../feature_selection/feature_selector.py | 8 +- .../rf_classifier_feature_selector.py | 2 +- .../rf_regressor_feature_selector.py | 2 +- .../transformers/imputers/imputer.py | 10 +-- .../imputers/per_column_imputer.py | 10 +-- .../transformers/imputers/simple_imputer.py | 14 ++-- .../transformers/imputers/target_imputer.py | 14 ++-- .../preprocessing/datetime_featurizer.py | 8 +- .../delayed_feature_transformer.py | 10 +-- .../preprocessing/drop_null_columns.py | 6 +- .../preprocessing/drop_rows_transformer.py | 2 +- .../preprocessing/featuretools.py | 10 +-- .../preprocessing/log_transformer.py | 6 +- 
.../transformers/preprocessing/lsa.py | 4 +- .../preprocessing/polynomial_detrender.py | 10 +-- .../preprocessing/text_featurizer.py | 6 +- .../preprocessing/text_transformer.py | 2 +- .../transform_primitive_components.py | 4 +- .../transformers/samplers/base_sampler.py | 18 ++-- .../transformers/samplers/oversamplers.py | 6 +- .../transformers/samplers/undersampler.py | 4 +- .../transformers/scalers/standard_scaler.py | 2 +- .../components/transformers/transformer.py | 14 ++-- evalml/pipelines/components/utils.py | 44 +++++----- .../multiclass_classification_pipeline.py | 2 +- evalml/pipelines/pipeline_base.py | 72 ++++++++-------- evalml/pipelines/regression_pipeline.py | 10 +-- .../time_series_classification_pipelines.py | 22 ++--- evalml/pipelines/time_series_pipeline_base.py | 6 +- .../time_series_regression_pipeline.py | 10 +-- evalml/pipelines/utils.py | 24 +++--- .../balanced_classification_sampler.py | 32 +++---- .../data_splitters/sampler_base.py | 17 ++-- .../data_splitters/time_series_split.py | 15 ++-- .../training_validation_split.py | 9 +- evalml/preprocessing/utils.py | 44 ++++------ evalml/problem_types/problem_types.py | 2 +- evalml/problem_types/utils.py | 30 +++---- evalml/tests/conftest.py | 8 +- .../data_checks_tests/test_data_checks.py | 2 +- .../test_explainers.py | 18 ++-- .../test_cost_benefit_matrix.py | 8 +- evalml/tuners/__init__.py | 1 + evalml/tuners/grid_search_tuner.py | 13 +-- evalml/tuners/random_search_tuner.py | 13 +-- evalml/tuners/skopt_tuner.py | 11 +-- evalml/tuners/tuner.py | 7 +- evalml/tuners/tuner_exceptions.py | 1 + evalml/utils/__init__.py | 1 + evalml/utils/base_meta.py | 3 + evalml/utils/cli_utils.py | 3 +- evalml/utils/gen_utils.py | 44 +++++----- evalml/utils/logger.py | 11 ++- evalml/utils/update_checker.py | 1 + evalml/utils/woodwork_utils.py | 13 +-- 153 files changed, 827 insertions(+), 895 deletions(-) diff --git a/Makefile b/Makefile index d063a09f19..eeff7163dc 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ clean: .PHONY: lint lint: flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions - pydocstyle evalml --convention=numpy + pydocstyle evalml/ --convention=google --add-ignore=D107 --add-select=D400 black evalml -t py39 --check .PHONY: lint-fix diff --git a/docs/source/user_guide/pipelines.ipynb b/docs/source/user_guide/pipelines.ipynb index af134fc1ab..3ec5ddb16b 100644 --- a/docs/source/user_guide/pipelines.ipynb +++ b/docs/source/user_guide/pipelines.ipynb @@ -151,7 +151,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Pipeline Parameters\n", + "## Pipeline Args:\n", "\n", "You can also pass in custom parameters by using the `parameters` parameter, which will then be used when instantiating each component in `component_graph`. The parameters dictionary needs to be in the format of a two-layered dictionary where the key-value pairs are the component name and corresponding component parameters dictionary. 
The component parameters dictionary consists of (parameter name, parameter values) key-value pairs.\n", "\n", diff --git a/evalml/automl/__init__.py b/evalml/automl/__init__.py index ccfd30dcb5..80a4f30427 100644 --- a/evalml/automl/__init__.py +++ b/evalml/automl/__init__.py @@ -1,3 +1,4 @@ +"""AutoMLSearch and related modules.""" from .automl_search import AutoMLSearch, search from .utils import ( get_default_primary_search_objective, diff --git a/evalml/automl/automl_algorithm/__init__.py b/evalml/automl/automl_algorithm/__init__.py index 1ce1f54b51..f908250f5b 100644 --- a/evalml/automl/automl_algorithm/__init__.py +++ b/evalml/automl/automl_algorithm/__init__.py @@ -1,3 +1,4 @@ +"""AutoML algorithms that power EvalML.""" from .automl_algorithm import AutoMLAlgorithm, AutoMLAlgorithmException from .iterative_algorithm import IterativeAlgorithm from .evalml_algorithm import EvalMLAlgorithm diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index 87eb3bd112..2013973cf7 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -1,3 +1,4 @@ +"""Base class for the AutoML algorithms which power EvalML.""" from abc import ABC, abstractmethod from evalml.exceptions import PipelineNotFoundError @@ -17,7 +18,7 @@ class AutoMLAlgorithm(ABC): To use this interface, you must define a next_batch method which returns the next group of pipelines to evaluate on the training data. That method may access state and results recorded from the previous batches, although that information is not tracked in a general way in this base class. Overriding add_result is a convenient way to record pipeline evaluation info if necessary. - Parameters + Args: allowed_pipelines (list(class)): A list of PipelineBase subclasses indicating the pipelines allowed in the search. The default of None indicates all pipelines for this problem type are allowed. custom_hyperparameters (dict): Custom hyperparameter ranges specified for pipelines to iterate over. max_iterations (int): The maximum number of iterations to be evaluated. @@ -52,14 +53,14 @@ def __init__( def next_batch(self): """Get the next batch of pipelines to evaluate. - Returns: + Returns list(PipelineBase): a list of instances of PipelineBase subclasses, ready to be trained and evaluated. """ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """Register results from evaluating a pipeline. - Parameters + Args: score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines. pipeline (PipelineBase): The trained pipeline object which was used to compute the score. trained_pipeline_results (dict): Results from training a pipeline. diff --git a/evalml/automl/automl_algorithm/evalml_algorithm.py b/evalml/automl/automl_algorithm/evalml_algorithm.py index 4fe6a2b86f..c8fbf2db3b 100644 --- a/evalml/automl/automl_algorithm/evalml_algorithm.py +++ b/evalml/automl/automl_algorithm/evalml_algorithm.py @@ -1,3 +1,4 @@ +"""An automl algorithm that consists of two modes: fast and long, where fast is a subset of long.""" import inspect import numpy as np @@ -48,6 +49,21 @@ class EvalMLAlgorithm(AutoMLAlgorithm): 8. Repeat these indefinitely until stopping criterion is met: a. For each of the previous top 3 estimators, sample 10 parameters from the tuner. Run all 30 in one batch b. Run ensembling + + Args: + X (pd.DataFrame): Training data. 
+ y (pd.Series): Target data. + problem_type (ProblemType): Problem type associated with training data. + sampler_name (BaseSampler): Sampler to use for preprocessing. + tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used. + random_seed (int): Seed for the random number generator. Defaults to 0. + pipeline_params (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. Defaults to None. + custom_hyperparameters (dict or None): Custom hyperparameter ranges specified for pipelines to iterate over. Defaults to None. + n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. Defaults to -1. + text_in_ensembling (boolean): If True and ensembling is True, then n_jobs will be set to 1 to avoid downstream sklearn stacking issues related to nltk. Defaults to None. + top_n (int): top n number of pipelines to use for long mode. + num_long_explore_pipelines (int): number of pipelines to explore for each top n pipeline at the start of long mode. + num_long_pipelines_per_batch (int): number of pipelines per batch for each top n pipeline through long mode. """ def __init__( @@ -66,23 +82,6 @@ def __init__( num_long_explore_pipelines=50, num_long_pipelines_per_batch=10, ): - """ - Parameters - X (pd.DataFrame): Training data - y (pd.Series): Target data - problem_type (ProblemType): Problem type associated with training data - sampler_name (BaseSampler): Sampler to use for preprocessing - tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used. - random_seed (int): Seed for the random number generator. Defaults to 0. - pipeline_params (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. Defaults to None. - custom_hyperparameters (dict or None): Custom hyperparameter ranges specified for pipelines to iterate over. Defaults to None. - n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. Defaults to -1. - text_in_ensembling (boolean): If True and ensembling is True, then n_jobs will be set to 1 to avoid downstream sklearn stacking issues related to nltk. Defaults to None. - top_n (int): top n number of pipelines to use for long mode. - num_long_explore_pipelines (int): number of pipelines to explore for each top n pipeline at the start of long mode. - num_long_pipelines_per_batch (int): number of pipelines per batch for each top n pipeline through long mode. - """ - super().__init__( allowed_pipelines=[], custom_hyperparameters=custom_hyperparameters, @@ -270,7 +269,6 @@ def next_batch(self): Returns: list(PipelineBase): a list of instances of PipelineBase subclasses, ready to be trained and evaluated. """ - if self._batch_number == 0: next_batch = self._create_naive_pipelines() elif self._batch_number == 1: @@ -295,7 +293,7 @@ def next_batch(self): def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """Register results from evaluating a pipeline. In batch number 2, the selected column names from the feature selector are taken to be used in a column selector. Information regarding the best pipeline is updated here as well. - Parameters + Args: score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines. 
pipeline (PipelineBase): The trained pipeline object which was used to compute the score. trained_pipeline_results (dict): Results from training a pipeline. diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 0043ea7335..ef098d619e 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -1,3 +1,4 @@ +"""An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance.""" import inspect from operator import itemgetter @@ -24,7 +25,7 @@ class IterativeAlgorithm(AutoMLAlgorithm): """An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance. - Parameters + Args: allowed_pipelines (list(class)): A list of PipelineBase instances indicating the pipelines allowed in the search. The default of None indicates all pipelines for this problem type are allowed. max_iterations (int): The maximum number of iterations to be evaluated. tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used. @@ -56,7 +57,7 @@ def __init__( ): """An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance. - Parameters + Args: allowed_pipelines (list(class)): A list of PipelineBase instances indicating the pipelines allowed in the search. The default of None indicates all pipelines for this problem type are allowed. max_iterations (int): The maximum number of iterations to be evaluated. tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used. @@ -198,7 +199,7 @@ def next_batch(self): def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """Register results from evaluating a pipeline. - Parameters + Args: score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines. pipeline (PipelineBase): The trained pipeline object which was used to compute the score. trained_pipeline_results (dict): Results from training a pipeline. diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 3554546b6d..339d3bfef9 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -1,3 +1,4 @@ +"""EvalML's core AutoML object.""" import copy import pickle import sys @@ -81,7 +82,7 @@ def search( This method is provided for convenience. If you'd like more control over when each of these steps is run, consider making calls directly to the various pieces like the data checks and AutoMLSearch, instead of using this method. - Parameters + Args: X_train (pd.DataFrame): The input training data of shape [n_samples, n_features]. Required. y_train (pd.Series): The target training data of length [n_samples]. Required for supervised learning tasks. @@ -152,7 +153,7 @@ def search( class AutoMLSearch: """Automated Pipeline search. - Parameters + Args: X_train (pd.DataFrame): The input training data of shape [n_samples, n_features]. Required. y_train (pd.Series): The target training data of length [n_samples]. Required for supervised learning tasks. 
@@ -224,7 +225,7 @@ class AutoMLSearch:
         ensembling (boolean): If True, runs ensembling in a separate batch after every allowed pipeline class has been iterated over.
             If the number of unique pipelines to search over per batch is one, ensembling will not run. Defaults to False.
 
-        max_batches (int): The maximum number of batches of pipelines to search. Parameters max_time, and
+        max_batches (int): The maximum number of batches of pipelines to search. Parameters max_time and
             max_iterations have precedence over stopping the search.
 
         problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
@@ -705,6 +706,7 @@ def _validate_objective(self, objective):
         return objective
 
     def __str__(self):
+        """Returns string representation of the AutoMLSearch object."""
         def _print_list(obj_list):
             lines = sorted(["\t{}".format(o.name) for o in obj_list])
             return "\n".join(lines)
@@ -717,7 +719,7 @@ def _get_funct_name(function):
         search_desc = (
             f"{handle_problem_types(self.problem_type).name} Search\n\n"
             f"Parameters: \n{'='*20}\n"
             f"Objective: {get_objective(self.objective).name}\n"
             f"Max Time: {self.max_time}\n"
             f"Max Iterations: {self.max_iterations}\n"
@@ -759,7 +761,7 @@ def _validate_problem_configuration(self, problem_configuration=None):
     def _handle_keyboard_interrupt(self):
         """Presents a prompt to the user asking if they want to stop the search.
 
         Returns:
             bool: If True, search should terminate early
         """
         leading_char = "\n"
@@ -784,7 +786,7 @@ def _handle_keyboard_interrupt(self):
     def search(self, show_iteration_plot=True):
         """Find the best pipeline for the data set.
 
-        Parameters
+        Args:
             feature_types (list, optional): list of feature types, either numerical or categorical.
                 Categorical features will automatically be encoded
@@ -944,15 +946,15 @@ def _num_pipelines(self):
         """Return the number of pipeline evaluations which have been made.
 
         Returns:
-            int: the number of pipeline evaluations made in the search
+            int: The number of pipeline evaluations made in the search.
         """
         return len(self._results["pipeline_results"])
 
     def _should_continue(self):
-        """Given the original stopping criterion and current state, should the search continue?
+        """Given the original stopping criterion and current state, return whether or not the search should continue.
 
         Returns:
-            bool: True if yes, False if no.
+            bool: True if search should continue, False otherwise.
         """
         if self._interrupted:
             return False
@@ -1199,10 +1201,10 @@ def _check_for_high_variance(self, pipeline, cv_scores, threshold=0.5):
     def get_pipeline(self, pipeline_id):
         """Given the ID of a pipeline training result, returns an untrained instance of the specified pipeline initialized with the parameters used to train that pipeline during automl search.
 
-        Parameters
+        Args:
             pipeline_id (int): pipeline to retrieve
 
         Returns:
             PipelineBase: untrained pipeline instance associated with the provided ID
         """
         pipeline_results = self.results["pipeline_results"].get(pipeline_id)
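Several of the docstrings above describe the post-search workflow (search, get_pipeline, describe_pipeline). A minimal usage sketch under the assumption of a binary classification dataset; the variable names are illustrative:

    import evalml
    from evalml.automl import AutoMLSearch

    X, y = evalml.demos.load_breast_cancer()
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary")
    automl.search()

    # Fetch an untrained copy of the best pipeline by its ID, then print its summary.
    best_id = automl.rankings.iloc[0]["id"]
    pipeline = automl.get_pipeline(best_id)
    automl.describe_pipeline(best_id)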
""" @@ -1295,7 +1297,7 @@ def describe_pipeline(self, pipeline_id, return_dict=False): def add_to_rankings(self, pipeline): """Fits and evaluates a given pipeline then adds the results to the automl rankings with the requirement that automl search has been run. - Parameters + Args: pipeline (PipelineBase): pipeline to train and evaluate. """ pipeline_rows = self.full_rankings[ @@ -1321,7 +1323,7 @@ def add_to_rankings(self, pipeline): def results(self): """Class that allows access to a copy of the results from `automl_search`. - Returns: dict containing `pipeline_results`: a dict with results from each pipeline, + Returns dict containing `pipeline_results`: a dict with results from each pipeline, and `search_order`: a list describing the order the pipelines were searched. """ return copy.deepcopy(self._results) @@ -1388,7 +1390,7 @@ def save( ): """Saves AutoML object at file path. - Parameters + Args: file_path (str): location to save file pickle_type {"pickle", "cloudpickle"}: the pickling library to use. pickle_protocol (int): the pickle data stream format. @@ -1415,7 +1417,7 @@ def load( ): """Loads AutoML object at file path. - Parameters + Args: file_path (str): location to find file to load pickle_type {"pickle", "cloudpickle"}: the pickling library to use. Currently not used since the standard pickle library can handle cloudpickles. @@ -1430,7 +1432,7 @@ def train_pipelines(self, pipelines): This can be helpful for training pipelines once the search is complete. - Parameters + Args: pipelines (list(PipelineBase)): List of pipelines to train. Returns: @@ -1470,7 +1472,7 @@ def train_pipelines(self, pipelines): def score_pipelines(self, pipelines, X_holdout, y_holdout, objectives): """Score a list of pipelines on the given holdout data. - Parameters + Args: pipelines (list(PipelineBase)): List of pipelines to train. X_holdout (pd.DataFrame): Holdout features. y_holdout (pd.Series): Holdout targets for scoring. 
@@ -1470,7 +1472,7 @@ def train_pipelines(self, pipelines):
     def score_pipelines(self, pipelines, X_holdout, y_holdout, objectives):
         """Score a list of pipelines on the given holdout data.
 
-        Parameters
+        Args:
             pipelines (list(PipelineBase)): List of pipelines to score.
             X_holdout (pd.DataFrame): Holdout features.
             y_holdout (pd.Series): Holdout targets for scoring.
@@ -1522,7 +1524,7 @@ def score_pipelines(self, pipelines, X_holdout, y_holdout, objectives):
 
     @property
     def plot(self):
-        # Return an instance of the plot with the latest scores
+        """Return an instance of the plot with the latest scores."""
         try:
             return PipelineSearchPlots(self.results, self.objective)
         except ImportError:
diff --git a/evalml/automl/callbacks.py b/evalml/automl/callbacks.py
index fe32ed7b62..3aa12c722b 100644
--- a/evalml/automl/callbacks.py
+++ b/evalml/automl/callbacks.py
@@ -1,3 +1,4 @@
+"""Callbacks available to pass to AutoML."""
 from evalml.exceptions import PipelineScoreError
 from evalml.utils.logger import get_logger
 
@@ -43,5 +44,5 @@ def log_error_callback(exception, traceback, automl, **kwargs):
     logger.info(
         f"\t\t\tFold {fold_num}: Exception during automl search: {str(exception)}"
     )
     logger.debug(f"\t\t\tFold {fold_num}: Parameters:\n\t{pipeline.parameters}")
     logger.debug(f"\t\t\tFold {fold_num}: Traceback:\n{trace}")
diff --git a/evalml/automl/engine/__init__.py b/evalml/automl/engine/__init__.py
index 8c73578f84..7b188519b7 100644
--- a/evalml/automl/engine/__init__.py
+++ b/evalml/automl/engine/__init__.py
@@ -1,3 +1,4 @@
+"""EvalML Engine classes used to evaluate pipelines in AutoMLSearch."""
 from .engine_base import (
     EngineBase,
     EngineComputation,
diff --git a/evalml/automl/engine/cf_engine.py b/evalml/automl/engine/cf_engine.py
index d74fcd0fe9..c6b8cc025f 100644
--- a/evalml/automl/engine/cf_engine.py
+++ b/evalml/automl/engine/cf_engine.py
@@ -1,3 +1,4 @@
+"""Custom CFClient API to match Dask's CFClient and allow context management."""
 from evalml.automl.engine.engine_base import (
     EngineBase,
     EngineComputation,
@@ -8,20 +9,21 @@
 
 class CFClient:
-    """Custom CFClient API to match Dask's CFClient and allow context management."""
+    """Custom CFClient API to match Dask's CFClient and allow context management.
+
+    Args:
+        pool(cf.ThreadPoolExecutor or cf.ProcessPoolExecutor): the resource pool to execute the futures work on.
+    """
 
     def __init__(self, pool):
-        """
-        Parameters
-            pool(cf.ThreadPoolExecutor or cf.ProcessPoolExecutor): the resource pool
-                to execute the futures work on.
-        """
         self.pool = pool
 
     def __enter__(self):
+        """Enter runtime context."""
         return self
 
     def __exit__(self, typ, value, traceback):
+        """Exit runtime context."""
         pass
 
     def submit(self, *args, **kwargs):
@@ -30,22 +32,18 @@ def submit(self, *args, **kwargs):
 
 class CFComputation(EngineComputation):
-    """A Future-like wrapper around jobs created by the CFEngine."""
+    """A Future-like wrapper around jobs created by the CFEngine.
+
+    Args:
+        future(cf.Future): The concurrent.futures.Future that is desired to be executed.
+    """
 
     def __init__(self, future):
-        """
-        Parameters
-            future(cf.Future): The concurrent.futures.Future that is desired
-                to be executed.
-        """
         self.work = future
         self.meta_data = {}
 
     def done(self):
-        """
-        Returns:
-            bool: Whether the computation is done.
-        """
+        """Returns whether the computation is done."""
         return self.work.done()
 
     def get_result(self):
@@ -55,6 +53,7 @@ def get_result(self):
             Exception: If computation fails. Returns traceback.
             cf.TimeoutError: If computation takes longer than default timeout time.
             cf.CancelledError: If computation was canceled before completing.
+
         Returns:
             The result of the requested job.
         """
@@ -71,10 +70,7 @@ def cancel(self):
 
     @property
     def is_cancelled(self):
-        """
-        Returns:
-            bool: Returns whether computation was cancelled.
- """ + """Returns whether computation was cancelled.""" return self.work.cancelled() @@ -92,13 +88,14 @@ def __init__(self, client): def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputation: """Send evaluation job to cluster. - Parameters - automl_config: structure containing data passed from AutoMLSearch instance - pipeline (pipeline.PipelineBase): pipeline to evaluate - X (pd.DataFrame): input data for modeling - y (pd.Series): target data for modeling - Returns - CFComputation: an object wrapping a reference to a future-like computation + Args: + automl_config: Structure containing data passed from AutoMLSearch instance. + pipeline (pipeline.PipelineBase): Pipeline to evaluate. + X (pd.DataFrame): Input data for modeling. + y (pd.Series): Target data for modeling. + + Returns: + CFComputation: An object wrapping a reference to a future-like computation occurring in the resource pool """ logger = self.setup_job_log() @@ -115,13 +112,14 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputat def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputation: """Send training job to cluster. - Parameters - automl_config: structure containing data passed from AutoMLSearch instance - pipeline (pipeline.PipelineBase): pipeline to train - X (pd.DataFrame): input data for modeling - y (pd.Series): target data for modeling - Returns - CFComputation: an object wrapping a reference to a future-like computation + Args: + automl_config: Structure containing data passed from AutoMLSearch instance. + pipeline (pipeline.PipelineBase): Pipeline to train. + X (pd.DataFrame): Input data for modeling. + y (pd.Series): Target data for modeling. + + Returns: + CFComputation: An object wrapping a reference to a future-like computation occurring in the resource pool """ future = self.client.submit( @@ -134,14 +132,16 @@ def submit_scoring_job( ) -> EngineComputation: """Send scoring job to cluster. - Parameters - automl_config: structure containing data passed from AutoMLSearch instance - pipeline (pipeline.PipelineBase): pipeline to train - X (pd.DataFrame): input data for modeling - y (pd.Series): target data for modeling - Returns - CFComputation: a object wrapping a reference to a future-like computation - occurring in the resource pool + Args: + automl_config: Structure containing data passed from AutoMLSearch instance. + pipeline (pipeline.PipelineBase): Pipeline to train. + X (pd.DataFrame): Input data for modeling. + y (pd.Series): Target data for modeling. + objectives (list(ObjectiveBase)): Objectives to score on. + + Returns: + CFComputation: An object wrapping a reference to a future-like computation + occurring in the resource pool. """ # Get the schema before we lose it X_schema = X.ww.schema diff --git a/evalml/automl/engine/dask_engine.py b/evalml/automl/engine/dask_engine.py index 64198d2a39..0a36856448 100644 --- a/evalml/automl/engine/dask_engine.py +++ b/evalml/automl/engine/dask_engine.py @@ -1,3 +1,4 @@ +"""A Future-like wrapper around jobs created by the DaskEngine.""" import joblib from dask.distributed import Client @@ -13,7 +14,7 @@ class DaskComputation(EngineComputation): """A Future-like wrapper around jobs created by the DaskEngine. - Parameters + Args: dask_future (callable): Computation to do. """ @@ -22,10 +23,7 @@ def __init__(self, dask_future): self.meta_data = {} def done(self): - """ - Returns: - bool: Whether the computation is done. 
- """ + """Returns whether the computation is done.""" return self.work.done() def get_result(self): @@ -42,10 +40,7 @@ def cancel(self): @property def is_cancelled(self): - """ - Returns: - bool: Returns whether computation was cancelled. - """ + """Returns whether computation was cancelled.""" return self.work.status @@ -66,7 +61,7 @@ def send_data_to_cluster(self, X, y): The implementation uses caching so the data is only sent once. This follows dask best practices. - Parameters + Args: X (pd.DataFrame): input data for modeling y (pd.Series): target data for modeling Returns @@ -85,13 +80,14 @@ def send_data_to_cluster(self, X, y): def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputation: """Send evaluation job to cluster. - Parameters - automl_config: structure containing data passed from AutoMLSearch instance - pipeline (pipeline.PipelineBase): pipeline to evaluate - X (pd.DataFrame): input data for modeling - y (pd.Series): target data for modeling - Returns - DaskComputation: a object wrapping a reference to a future-like computation + Args: + automl_config: Structure containing data passed from AutoMLSearch instance. + pipeline (pipeline.PipelineBase): Pipeline to evaluate. + X (pd.DataFrame): Input data for modeling. + y (pd.Series): Target data for modeling. + + Returns: + DaskComputation: An object wrapping a reference to a future-like computation occurring in the dask cluster """ logger = self.setup_job_log() @@ -109,13 +105,14 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputat def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputation: """Send training job to cluster. - Parameters - automl_config: structure containing data passed from AutoMLSearch instance - pipeline (pipeline.PipelineBase): pipeline to train - X (pd.DataFrame): input data for modeling - y (pd.Series): target data for modeling - Returns - DaskComputation: a object wrapping a reference to a future-like computation + Args: + automl_config: Structure containing data passed from AutoMLSearch instance. + pipeline (pipeline.PipelineBase): Pipeline to train. + X (pd.DataFrame): Input data for modeling. + y (pd.Series): Target data for modeling. + + Returns: + DaskComputation: An object wrapping a reference to a future-like computation occurring in the dask cluster """ X, y = self.send_data_to_cluster(X, y) @@ -129,13 +126,15 @@ def submit_scoring_job( ) -> EngineComputation: """Send scoring job to cluster. - Parameters - automl_config: structure containing data passed from AutoMLSearch instance - pipeline (pipeline.PipelineBase): pipeline to train - X (pd.DataFrame): input data for modeling - y (pd.Series): target data for modeling - Returns - DaskComputation: a object wrapping a reference to a future-like computation + Args: + automl_config: Structure containing data passed from AutoMLSearch instance. + pipeline (pipeline.PipelineBase): Pipeline to train. + X (pd.DataFrame): Input data for modeling. + y (pd.Series): Target data for modeling. + objectives (list(ObjectiveBase)): List of objectives to score on. 
+ + Returns: + DaskComputation: An object wrapping a reference to a future-like computation occurring in the dask cluster """ # Get the schema before we lose it diff --git a/evalml/automl/engine/engine_base.py b/evalml/automl/engine/engine_base.py index 7ef97054af..c95c45b848 100644 --- a/evalml/automl/engine/engine_base.py +++ b/evalml/automl/engine/engine_base.py @@ -1,3 +1,4 @@ +"""Base class for EvalML engines.""" import sys import time import traceback @@ -35,7 +36,7 @@ def cancel(self): class JobLogger: - """Mimics the behavior of a python logging.Logger but stores all messages rather than actually logging them. + """Mimic the behavior of a python logging.Logger but stores all messages rather than actually logging them. This is used during engine jobs so that log messages are recorded after the job completes. This is desired so that all of the messages @@ -78,8 +79,11 @@ def write_to_logger(self, logger): class EngineBase(ABC): + """Base class for EvalML engines.""" + @staticmethod def setup_job_log(): + """Set up logger for job.""" return JobLogger() @abstractmethod @@ -98,15 +102,15 @@ def submit_scoring_job(self, automl_config, pipeline, X, y, objectives): def train_pipeline(pipeline, X, y, automl_config, schema=True): """Train a pipeline and tune the threshold if necessary. - Parameters + Args: pipeline (PipelineBase): Pipeline to train. X (pd.DataFrame): Features to train on. y (pd.Series): Target to train on. - automl_config (AutoMLSearch): The AutoMLSearch object, used to access config and the error callback - schema (bool): Whether to use the schemas for X and y + automl_config (AutoMLSearch): The AutoMLSearch object, used to access config and the error callback. + schema (bool): Whether to use the schemas for X and y. Defaults to True. Returns: - pipeline (PipelineBase): trained pipeline. + pipeline (PipelineBase): A trained pipeline instance. """ X_threshold_tuning = None y_threshold_tuning = None @@ -147,11 +151,11 @@ def train_and_score_pipeline( ): """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores. - Parameters - pipeline (PipelineBase): The pipeline to score - automl_config (AutoMLSearch): The AutoMLSearch object, used to access config and the error callback - full_X_train (pd.DataFrame): Training features - full_y_train (pd.Series): Training target + Args: + pipeline (PipelineBase): The pipeline to score. + automl_config (AutoMLSearch): The AutoMLSearch object, used to access config and the error callback. + full_X_train (pd.DataFrame): Training features. + full_y_train (pd.Series): Training target. Returns: tuple of three items: First - A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details. @@ -288,13 +292,13 @@ def train_and_score_pipeline( def evaluate_pipeline(pipeline, automl_config, X, y, logger): - """Function submitted to the submit_evaluation_job engine method. + """Submit this function to the submit_evaluation_job engine method. - Parameters - pipeline (PipelineBase): The pipeline to score - automl_config (AutoMLConfig): The AutoMLSearch object, used to access config and the error callback - X (pd.DataFrame): Training features - y (pd.Series): Training target + Args: + pipeline (PipelineBase): The pipeline to score. + automl_config (AutoMLConfig): The AutoMLSearch object, used to access config and the error callback. + X (pd.DataFrame): Training features. + y (pd.Series): Training target. 
Returns: tuple of three items: First - A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details. @@ -315,14 +319,14 @@ def evaluate_pipeline(pipeline, automl_config, X, y, logger): def score_pipeline(pipeline, X, y, objectives, X_schema=None, y_schema=None): - """Wrapper around pipeline.score method to make it easy to score pipelines with dask. + """Wrap around pipeline.score method to make it easy to score pipelines with dask. - Parameters + Args: pipeline (PipelineBase): The pipeline to score. X (pd.DataFrame): Features to score on. y (pd.Series): Target used to calcualte scores. - X_schema (ww.TableSchema): Schema for features. - y_schema (ww.ColumnSchema): Schema for columns. + X_schema (ww.TableSchema): Schema for features. Defaults to None. + y_schema (ww.ColumnSchema): Schema for columns. Defaults to None. Returns: dict containing pipeline scores. diff --git a/evalml/automl/engine/sequential_engine.py b/evalml/automl/engine/sequential_engine.py index 30461a8f01..dd05d3cc04 100644 --- a/evalml/automl/engine/sequential_engine.py +++ b/evalml/automl/engine/sequential_engine.py @@ -1,3 +1,4 @@ +"""A Future-like api for jobs created by the SequentialEngine, an Engine that sequentially computes the submitted jobs.""" from evalml.automl.engine.engine_base import ( EngineBase, EngineComputation, @@ -17,7 +18,7 @@ class SequentialComputation(EngineComputation): computation is "done", by always returning True in done() we make sure that get_result is called in the order that the jobs are submitted. So the computations happen sequentially! - Parameters + Args: work (callable): Computation that should be done by the engine. """ @@ -33,7 +34,8 @@ def done(self): def get_result(self): """Gets the computation result. Will block until the computation is finished. - Raises Exception: If computation fails. Returns traceback. + Raises: + Exception: If computation fails. Returns traceback. """ return self.work(**self.kwargs) @@ -48,6 +50,14 @@ class SequentialEngine(EngineBase): """ def submit_evaluation_job(self, automl_config, pipeline, X, y): + """Submit a job to evaluate a pipeline. + + Args: + automl_config: Structure containing data passed from AutoMLSearch instance. + pipeline (pipeline.PipelineBase): Pipeline to evaluate. + X (pd.DataFrame): Input data for modeling. + y (pd.Series): Target data for modeling. + """ logger = self.setup_job_log() return SequentialComputation( work=evaluate_pipeline, @@ -59,6 +69,14 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y): ) def submit_training_job(self, automl_config, pipeline, X, y): + """Submit a job to train a pipeline. + + Args: + automl_config: Structure containing data passed from AutoMLSearch instance. + pipeline (pipeline.PipelineBase): Pipeline to evaluate. + X (pd.DataFrame): Input data for modeling. + y (pd.Series): Target data for modeling. + """ return SequentialComputation( work=train_pipeline, pipeline=pipeline, @@ -69,6 +87,15 @@ def submit_training_job(self, automl_config, pipeline, X, y): ) def submit_scoring_job(self, automl_config, pipeline, X, y, objectives): + """Submit a job to score a pipeline. + + Args: + automl_config: Structure containing data passed from AutoMLSearch instance. + pipeline (pipeline.PipelineBase): Pipeline to train. + X (pd.DataFrame): Input data for modeling. + y (pd.Series): Target data for modeling. + objectives (list(ObjectiveBase)): List of objectives to score on. 
+ """ objectives = [get_objective(o, return_instance=True) for o in objectives] computation = SequentialComputation( work=score_pipeline, diff --git a/evalml/automl/pipeline_search_plots.py b/evalml/automl/pipeline_search_plots.py index cd2805eb68..f21c1a90cb 100644 --- a/evalml/automl/pipeline_search_plots.py +++ b/evalml/automl/pipeline_search_plots.py @@ -1,7 +1,14 @@ +"""Plots displayed during pipeline search.""" from evalml.utils import import_or_raise, jupyter_check class SearchIterationPlot: + """Search iteration plot. + + Args: + results (dict): Dictionary of current results. + objective (ObjectiveBase): Objective that AutoML is optimizing for. + """ def __init__(self, results, objective): self._go = import_or_raise( "plotly.graph_objects", @@ -35,6 +42,7 @@ def __init__(self, results, objective): self._go = None def update(self, results, objective): + """Update the search plot.""" if len(results["search_order"]) > 0 and len(results["pipeline_results"]) > 0: iter_idx = results["search_order"] pipeline_res = results["pipeline_results"] @@ -74,14 +82,14 @@ def update(self, results, objective): class PipelineSearchPlots: - """Plots for the AutoMLSearch class.""" + """Plots for the AutoMLSearch class during search. - def __init__(self, results, objective): - """Make plots for the AutoMLSearch class. + Args: + results (dict): Dictionary of current results. + objective (ObjectiveBase): Objective that AutoML is optimizing for. + """ - Parameters - data (AutoMLSearch): Automated pipeline search object - """ + def __init__(self, results, objective): self._go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects", diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 93b27eb8a4..219076cfe0 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -1,3 +1,4 @@ +"""Utilities useful in AutoML.""" from collections import namedtuple import pandas as pd @@ -32,11 +33,10 @@ def get_default_primary_search_objective(problem_type): """Get the default primary search objective for a problem type. - Parameters + Args: problem_type (str or ProblemType): problem type of interest. - Returns - ------- + Returns: ObjectiveBase: primary objective instance for the problem type. """ problem_type = handle_problem_types(problem_type) @@ -62,7 +62,7 @@ def make_data_splitter( ): """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search. - Parameters + Args: X (pd.DataFrame): The input training data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. problem_type (ProblemType): The type of machine learning problem. @@ -105,7 +105,7 @@ def tune_binary_threshold( ): """Tunes the threshold of a binary pipeline to the X and y thresholding data. - Parameters + Args: pipeline (Pipeline): Pipeline instance to threshold. objective (ObjectiveBase): The objective we want to tune with. If not tuneable and best_pipeline is True, will use F1. problem_type (ProblemType): The problem type of the pipeline. @@ -129,7 +129,7 @@ def tune_binary_threshold( def check_all_pipeline_names_unique(pipelines): """Checks whether all the pipeline names are unique. - Parameters + Args: pipelines (list(PipelineBase)): List of pipelines to check if all names are unique. 
     Returns:
@@ -169,14 +169,14 @@ def check_all_pipeline_names_unique(pipelines):
 def get_best_sampler_for_data(X, y, sampler_method, sampler_balanced_ratio):
     """Returns the name of the sampler component to use for AutoMLSearch.
 
-    Parameters
+    Args:
         X (pd.DataFrame): The input feature data
         y (pd.Series): The input target data
         sampler_method (str): The sampler_type argument passed to AutoMLSearch
         sampler_balanced_ratio (float): The ratio of min:majority targets that we would consider balanced, or should balance the classes to.
 
     Returns:
         str, None: The string name of the sampling component to use, or None if no sampler is necessary
     """
     # we check for the class balances
@@ -211,13 +211,13 @@ def get_pipelines_from_component_graphs(
 ):
     """Returns created pipelines from passed component graphs based on the specified problem type.
 
-    Parameters
+    Args:
         component_graphs_dict (dict): The dict of component graphs.
         problem_type (str or ProblemType): The problem type for which pipelines will be created.
         parameters (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines.
         random_seed (int): Random seed.
 
     Returns:
         list: List of pipelines made from the passed component graphs.
     """
     pipeline_class = {
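A brief sketch of get_pipelines_from_component_graphs as documented above; the import path and the two-pipeline dict are assumptions for illustration, and the component names follow evalml's built-in component registry:

    from evalml.automl.utils import get_pipelines_from_component_graphs

    # A hypothetical dict of named component graphs.
    component_graphs = {
        "RF Pipeline": ["Imputer", "Random Forest Classifier"],
        "ET Pipeline": ["Imputer", "Extra Trees Classifier"],
    }
    pipelines = get_pipelines_from_component_graphs(component_graphs, "binary")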
diff --git a/evalml/data_checks/class_imbalance_data_check.py b/evalml/data_checks/class_imbalance_data_check.py
index 77ed5af741..0a9410c7eb 100644
--- a/evalml/data_checks/class_imbalance_data_check.py
+++ b/evalml/data_checks/class_imbalance_data_check.py
@@ -1,5 +1,4 @@
-"""
-Data check that checks if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds.
+"""Data check that checks if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds.
 
 Use for classification problems.
 """
@@ -15,8 +14,7 @@ class ClassImbalanceDataCheck(DataCheck):
     """Check if any of the target labels are imbalanced, or if the number of values for each target are below 2 times the number of CV folds.
     Use for classification problems.
 
-    Arguments
-    ---------
+    Args:
         threshold (float): The minimum threshold allowed for class imbalance before a warning is raised.
             This threshold is calculated by comparing the number of samples in each class to the sum of samples in that class and the majority class.
            For example, a multiclass case with [900, 900, 100] samples per classes 0, 1, and 2, respectively,
@@ -50,18 +48,15 @@ def __init__(self, threshold=0.1, min_samples=100, num_cv_folds=3):
     def validate(self, X, y):
         """Check if any target labels are imbalanced beyond a threshold for binary and multiclass problems. Ignores NaN values in target labels if they appear.
 
-        Arguments
-        ---------
+        Args:
             X (pd.DataFrame, np.ndarray): Features. Ignored.
             y (pd.Series, np.ndarray): Target labels to check for imbalanced data.
 
-        Returns
-        -------
+        Returns:
             dict: Dictionary with DataCheckWarnings if imbalance in classes is less than the threshold,
             and DataCheckErrors if the number of values for each target is below 2 * num_cv_folds.
 
-        Examples
-        --------
+        Example:
             >>> import pandas as pd
             >>> X = pd.DataFrame()
             >>> y = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
diff --git a/evalml/data_checks/data_check.py b/evalml/data_checks/data_check.py
index 418e07a99a..0cd32674e0 100644
--- a/evalml/data_checks/data_check.py
+++ b/evalml/data_checks/data_check.py
@@ -21,13 +21,11 @@ def name(cls):
     def validate(self, X, y=None):
         """Inspect and validate the input data, runs any necessary calculations or algorithms, and returns a list of warnings and errors if applicable.
 
-        Arguments
-        ---------
+        Args:
             X (pd.DataFrame): The input data of shape [n_samples, n_features]
             y (pd.Series, optional): The target data of length [n_samples]
 
-        Returns
-        -------
+        Returns:
             dict (DataCheckMessage): Dictionary of DataCheckError and DataCheckWarning messages
         """
diff --git a/evalml/data_checks/data_check_action.py b/evalml/data_checks/data_check_action.py
index 52acd7fe02..778c83eed9 100644
--- a/evalml/data_checks/data_check_action.py
+++ b/evalml/data_checks/data_check_action.py
@@ -4,8 +4,7 @@
 class DataCheckAction:
     """Recommended action returned by a DataCheck.
 
-    Arguments
-    ---------
+    Args:
         action_code (DataCheckActionCode): Action code associated with the action.
         metadata (dict, optional): Additional useful information associated with the action. Defaults to None.
     """
@@ -15,8 +14,7 @@ def __init__(self, action_code, metadata=None):
         self.metadata = metadata or {}
 
     def __eq__(self, other):
-        """
-        Check for equality.
+        """Check for equality.
 
         Two DataCheckAction objs are considered equivalent if all of their attributes are equivalent.
diff --git a/evalml/data_checks/data_check_message.py b/evalml/data_checks/data_check_message.py
index 723555cb10..7d7b884c7b 100644
--- a/evalml/data_checks/data_check_message.py
+++ b/evalml/data_checks/data_check_message.py
@@ -3,11 +3,9 @@
 
 class DataCheckMessage:
-    """
-    Base class for a message returned by a DataCheck, tagged by name.
+    """Base class for a message returned by a DataCheck, tagged by name.
 
-    Arguments
-    ---------
+    Args:
         message (str): Message string.
         data_check_name (str): Name of data check.
         message_code (DataCheckMessageCode): Message code associated with message. Defaults to None.
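The DataCheck.validate contract above (a dict with "warnings", "errors", and "actions" keys) is what custom checks implement. A minimal sketch; AllZeroDataCheck is a hypothetical example check, not part of the library:

    from evalml.data_checks import DataCheck, DataCheckWarning

    class AllZeroDataCheck(DataCheck):
        """Warn about feature columns that contain only zeros."""

        def validate(self, X, y=None):
            """Return warnings for all-zero columns in the documented dict format."""
            results = {"warnings": [], "errors": [], "actions": []}
            for col in X.columns[(X == 0).all()]:
                results["warnings"].append(
                    DataCheckWarning(
                        message=f"Column {col} contains only zeros.",
                        data_check_name=self.name,
                    ).to_dict()
                )
            return results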
diff --git a/evalml/data_checks/data_checks.py b/evalml/data_checks/data_checks.py
index 29f696567e..cc6f3c51d1 100644
--- a/evalml/data_checks/data_checks.py
+++ b/evalml/data_checks/data_checks.py
@@ -17,11 +17,9 @@ def _has_defaults_for_all_args(init):
 
 class DataChecks:
-    """
-    A collection of data checks.
+    """A collection of data checks.
 
-    Arguments
-    ---------
+    Args:
         data_checks (list (DataCheck)): List of DataCheck objects.
         data_check_params (dict): Parameters for passed DataCheck objects.
     """
@@ -70,7 +68,7 @@ def _init_data_checks(data_check_classes, params):
         class_params = params.get(data_check_class.name, {})
         if not isinstance(class_params, dict):
             raise DataCheckInitError(
                 f"Parameters for {data_check_class.name} were not in a dictionary. Received {class_params}."
             )
         try:
             data_check_instances.append(data_check_class(**class_params))
@@ -89,13 +87,11 @@ def __init__(self, data_checks=None, data_check_params=None):
     def validate(self, X, y=None):
         """Inspect and validate the input data against data checks and returns a list of warnings and errors if applicable.
 
-        Arguments
-        ---------
+        Args:
             X (pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features]
             y (pd.Series, np.ndarray): The target data of length [n_samples]
 
-        Returns
-        -------
+        Returns:
             dict: Dictionary containing DataCheckMessage objects
         """
         messages = {"warnings": [], "errors": [], "actions": []}
diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py
index f185916543..4626cd1988 100644
--- a/evalml/data_checks/datetime_format_data_check.py
+++ b/evalml/data_checks/datetime_format_data_check.py
@@ -8,8 +8,7 @@ class DateTimeFormatDataCheck(DataCheck):
     """Check if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators.
 
-    Parameters
-    ----------
+    Args:
         datetime_column (str, int): The name of the datetime column. If the datetime values are in the index, then pass "index".
     """
@@ -19,17 +18,14 @@ def __init__(self, datetime_column="index"):
     def validate(self, X, y):
         """Check if the target data has equal intervals and is sorted.
 
-        Parameters
-        ----------
+        Args:
             X (pd.DataFrame, np.ndarray): Features.
             y (pd.Series, np.ndarray): Target data.
 
-        Returns
-        -------
+        Returns:
             dict (DataCheckError): List with DataCheckErrors if unequal intervals are found in the datetime column.
 
-        Examples
-        --------
+        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame(pd.date_range("January 1, 2021", periods=8), columns=["dates"])
            >>> y = pd.Series([1, 2, 4, 2, 1, 2, 3, 1])
@@ -43,7 +39,6 @@ def validate(self, X, y):
            ...                 "details": {}}],
            ...     "warnings": [],
            ...     "actions": []}
-
         """
         results = {"warnings": [], "errors": [], "actions": []}
diff --git a/evalml/data_checks/datetime_nan_data_check.py b/evalml/data_checks/datetime_nan_data_check.py
index 55a2e9d60a..2724d82e5f 100644
--- a/evalml/data_checks/datetime_nan_data_check.py
+++ b/evalml/data_checks/datetime_nan_data_check.py
@@ -12,17 +12,14 @@ class DateTimeNaNDataCheck(DataCheck):
     def validate(self, X, y=None):
         """Check if any datetime columns contain NaN values.
 
-        Arguments
-        ---------
+        Args:
             X (pd.DataFrame, np.ndarray): Features.
             y (pd.Series, np.ndarray): Ignored. Defaults to None.
 
-        Returns
-        -------
+        Returns:
             dict: dict with a DataCheckError if NaN values are present in datetime columns.
 
-        Examples
-        --------
+        Example:
             >>> import pandas as pd
             >>> import woodwork as ww
             >>> import numpy as np
diff --git a/evalml/data_checks/default_data_checks.py b/evalml/data_checks/default_data_checks.py
index 2d1c5cd7fb..74f51df0c8 100644
--- a/evalml/data_checks/default_data_checks.py
+++ b/evalml/data_checks/default_data_checks.py
@@ -35,8 +35,7 @@ class DefaultDataChecks(DataChecks):
     - `TargetDistributionDataCheck` (for regression problem types)
     - `DateTimeFormatDataCheck` (for time series problem types)
 
-    Arguments
-    ---------
+    Args:
         problem_type (str): The problem type that is being validated. Can be regression, binary, or multiclass.
         objective (str or ObjectiveBase): Name or instance of the objective class.
         n_splits (int): The number of splits as determined by the data splitter being used. Defaults to 3.
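DefaultDataChecks bundles the checks listed above. A short usage sketch, assuming a binary problem and evalml's objective naming; X and y are illustrative training data:

    from evalml.data_checks import DefaultDataChecks

    data_checks = DefaultDataChecks(problem_type="binary", objective="log loss binary")
    results = data_checks.validate(X, y)
    print(results["warnings"], results["errors"])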
diff --git a/evalml/data_checks/highly_null_data_check.py b/evalml/data_checks/highly_null_data_check.py
index 9b0b1c1ce1..b200929251 100644
--- a/evalml/data_checks/highly_null_data_check.py
+++ b/evalml/data_checks/highly_null_data_check.py
@@ -13,8 +13,7 @@ class HighlyNullDataCheck(DataCheck):
     """Check if there are any highly-null columns and rows in the input.
 
-    Parameters
-    ----------
+    Args:
         pct_null_col_threshold(float): If the percentage of NaN values in an input feature exceeds this amount,
             that column will be considered highly-null. Defaults to 0.95.
         pct_null_row_threshold(float): If the percentage of NaN values in an input row exceeds this amount,
@@ -37,17 +36,14 @@ def __init__(self, pct_null_col_threshold=0.95, pct_null_row_threshold=0.95):
     def validate(self, X, y=None):
         """Check if there are any highly-null columns or rows in the input.
 
-        Parameters
-        ----------
+        Args:
             X (pd.DataFrame, np.ndarray): Features.
-            y (pd.Series, np.ndarray): Ignored.
+            y (pd.Series, np.ndarray): Ignored. Defaults to None.
 
-        Returns
-        -------
+        Returns:
             dict: dict with a DataCheckWarning if there are any highly-null columns or rows.
 
-        Examples
-        --------
+        Example:
             >>> import pandas as pd
             >>> class SeriesWrap():
             ...     def __init__(self, series):
@@ -78,7 +74,6 @@ def validate(self, X, y=None):
             ...                  "details": {"column": "lots_of_null", "pct_null_rows": 0.8}}],
             ...     "actions": [{"code": "DROP_ROWS", "metadata": {"rows": [0, 1, 2, 3]}},
             ...                 {"code": "DROP_COL", "metadata": {"column": "lots_of_null"}}]}
-
         """
         results = {"warnings": [], "errors": [], "actions": []}
diff --git a/evalml/data_checks/id_columns_data_check.py b/evalml/data_checks/id_columns_data_check.py
index d6bb90e782..b875428cec 100644
--- a/evalml/data_checks/id_columns_data_check.py
+++ b/evalml/data_checks/id_columns_data_check.py
@@ -12,8 +12,7 @@ class IDColumnsDataCheck(DataCheck):
     """Check if any of the features are likely to be ID columns.
 
-    Parameters
-    ----------
+    Args:
         id_threshold (float): The probability threshold to be considered an ID column. Defaults to 1.0.
     """
@@ -29,17 +28,14 @@ def validate(self, X, y=None):
         - column name ends in "_id"
         - column contains all unique values (and is categorical / integer type)
 
-        Parameters
-        ----------
+        Args:
             X (pd.DataFrame, np.ndarray): The input features to check.
             y (pd.Series): The target. Defaults to None. Ignored.
 
-        Returns
-        -------
+        Returns:
             dict: A dictionary of features with column name or index and their probability of being ID columns
 
-        Examples
-        --------
+        Example:
             >>> import pandas as pd
             >>> df = pd.DataFrame({
             ...     'df_id': [0, 1, 2, 3, 4],
diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py
index 711f6a25ab..8f1c5e8ac0 100644
--- a/evalml/data_checks/invalid_targets_data_check.py
+++ b/evalml/data_checks/invalid_targets_data_check.py
@@ -26,8 +26,7 @@ class InvalidTargetDataCheck(DataCheck):
     """Check if the target data contains missing or invalid values.
 
-    Arguments
-    ---------
+    Args:
         problem_type (str or ProblemTypes): The specific problem type to data check for.
             e.g. 'binary', 'multiclass', 'regression', 'time series regression'
         objective (str or ObjectiveBase): Name or instance of the objective class.
@@ -47,17 +46,14 @@ def __init__(self, problem_type, objective, n_unique=100):
     def validate(self, X, y):
         """Check if the target data contains missing or invalid values.
 
-        Parameters
-        ----------
+        Args:
             X (pd.DataFrame, np.ndarray): Features. Ignored.
             y (pd.Series, np.ndarray): Target data to check for invalid values.
 
-        Returns
-        -------
+        Returns:
             dict (DataCheckError): List with DataCheckErrors if any invalid values are found in the target data.
- Examples - -------- + Example: >>> import pandas as pd >>> X = pd.DataFrame({"col": [1, 2, 3, 1]}) >>> y = pd.Series([0, 1, None, None]) diff --git a/evalml/data_checks/multicollinearity_data_check.py b/evalml/data_checks/multicollinearity_data_check.py index 583db4ae16..59d8fb6170 100644 --- a/evalml/data_checks/multicollinearity_data_check.py +++ b/evalml/data_checks/multicollinearity_data_check.py @@ -10,8 +10,7 @@ class MulticollinearityDataCheck(DataCheck): """Check if any set features are likely to be multicollinear. - Parameters - ---------- + Args: threshold (float): The threshold to be considered. Defaults to 0.9. """ @@ -23,17 +22,14 @@ def __init__(self, threshold=0.9): def validate(self, X, y=None): """Check if any set of features are likely to be multicollinear. - Parameters - ---------- + Args: X (pd.DataFrame): The input features to check. y (pd.Series): The target. Ignored. - Returns - ------- + Returns: dict: dict with a DataCheckWarning if there are any potentially multicollinear columns. - Examples - -------- + Example: >>> import pandas as pd >>> col = pd.Series([1, 0, 2, 3, 4]) >>> X = pd.DataFrame({"col_1": col, "col_2": col * 3}) diff --git a/evalml/data_checks/natural_language_nan_data_check.py b/evalml/data_checks/natural_language_nan_data_check.py index 0789bc7e62..1fba84674f 100644 --- a/evalml/data_checks/natural_language_nan_data_check.py +++ b/evalml/data_checks/natural_language_nan_data_check.py @@ -9,20 +9,17 @@ class NaturalLanguageNaNDataCheck(DataCheck): """Checks each column in the input for natural language features and will issue an error if NaN values are present.""" def validate(self, X, y=None): - """ - Check if any natural language columns contain NaN values. + """Check if any natural language columns contain NaN values. - Parameters - ---------- + Args: X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. Defaults to None. Returns - ------- dict: dict with a DataCheckError if NaN values are present in natural language columns. - Examples - -------- + Example: + >>> import pandas as pd >>> import woodwork as ww >>> import numpy as np diff --git a/evalml/data_checks/no_variance_data_check.py b/evalml/data_checks/no_variance_data_check.py index 57dfd926c9..2e84d19ebf 100644 --- a/evalml/data_checks/no_variance_data_check.py +++ b/evalml/data_checks/no_variance_data_check.py @@ -14,33 +14,27 @@ class NoVarianceDataCheck(DataCheck): - """ - Check if the target or any of the features have no variance. + """Check if the target or any of the features have no variance. - Parameters - ---------- + Args: count_nan_as_value (bool): If True, missing values will be counted as their own unique value. Additionally, if true, will return a DataCheckWarning instead of an error if the feature has mostly missing data and only one unique value. Defaults to False. - """ def __init__(self, count_nan_as_value=False): self._dropnan = not count_nan_as_value def _check_for_errors(self, column_name, count_unique, any_nulls): - """ - Check if a column has no variance. + """Check if a column has no variance. - Parameters - ---------- + Args: column_name (str): Name of the column we are checking. count_unique (float): Number of unique values in this column. any_nulls (bool): Whether this column has any missing data. - Returns - ------- + Returns: DataCheckError if the column has no variance or DataCheckWarning if the column has two unique values including NaN. """ message = f"{column_name} has {int(count_unique)} unique value." 
@@ -64,16 +58,13 @@ def _check_for_errors(self, column_name, count_unique, any_nulls): ) def validate(self, X, y): - """ - Check if the target or any of the features have no variance (1 unique value). + """Check if the target or any of the features have no variance (1 unique value). - Parameters - ---------- + Args: X (pd.DataFrame, np.ndarray): The input features. y (pd.Series, np.ndarray): The target data. - Returns - ------- + Returns: dict: dict of warnings/errors corresponding to features or target with no variance. """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/outliers_data_check.py b/evalml/data_checks/outliers_data_check.py index ee3f82e93f..4a7b5cfebf 100644 --- a/evalml/data_checks/outliers_data_check.py +++ b/evalml/data_checks/outliers_data_check.py @@ -11,8 +11,7 @@ class OutliersDataCheck(DataCheck): - """ - Checks if there are any outliers in input data by using IQR to determine score anomalies. + """Checks if there are any outliers in input data by using IQR to determine score anomalies. Columns with score anomalies are considered to contain outliers. """ @@ -20,17 +19,14 @@ class OutliersDataCheck(DataCheck): def validate(self, X, y=None): """Check if there are any outliers in a dataframe by using IQR to determine column anomalies. Column with anomalies are considered to contain outliers. - Parameters - ---------- + Args: X (pd.DataFrame, np.ndarray): Input features. y (pd.Series, np.ndarray): Ignored. Defaults to None. - Returns - ------- + Returns: dict: A dictionary with warnings if any columns have outliers. - Examples - -------- + Example: >>> import pandas as pd >>> df = pd.DataFrame({ ... 'x': [1, 2, 3, 4, 5], @@ -90,13 +86,11 @@ def _no_outlier_prob(num_records: int, pct_outliers: float) -> float: Original credit goes to Jad Raad and Dan Putler of Alteryx. - Parameters - ---------- + Args: num_records (int): The integer number of non-missing values in a column. pct_outliers (float): The percentage of potential outliers in a column. - Returns - ------- + Returns: float: The probability that no outliers are present in the column. """ # Calculate the shape and scale parameters of the approximate diff --git a/evalml/data_checks/sparsity_data_check.py b/evalml/data_checks/sparsity_data_check.py index 177604cee4..acab25fbdf 100644 --- a/evalml/data_checks/sparsity_data_check.py +++ b/evalml/data_checks/sparsity_data_check.py @@ -15,8 +15,7 @@ class SparsityDataCheck(DataCheck): """Check if there are any columns with sparsely populated values in the input. - Parameters - ---------- + Args: problem_type (str or ProblemTypes): The specific problem type to data check for. 'multiclass' or 'time series multiclass' is the only accepted problem type. threshold (float): The threshold value, or percentage of each column's unique values, @@ -40,17 +39,14 @@ def __init__(self, problem_type, threshold, unique_count_threshold=10): def validate(self, X, y=None): """Calculate what percentage of each column's unique values exceed the count threshold and compare that percentage to the sparsity threshold stored in the class instance. - Parameters - ---------- + Args: X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. - Returns - ------- + Returns: dict: dict with a DataCheckWarning if there are any sparse columns. - Examples - -------- + Example: >>> import pandas as pd >>> df = pd.DataFrame({ ... 
'sparse': [float(x) for x in range(100)], @@ -100,17 +96,14 @@ def validate(self, X, y=None): @staticmethod def sparsity_score(col, count_threshold=10): - """ - Calculate a sparsity score for the given value counts by calculating the percentage of unique values that exceed the count_threshold. + """Calculate a sparsity score for the given value counts by calculating the percentage of unique values that exceed the count_threshold. - Parameters - ---------- + Args: col (pd.Series): Feature values. count_threshold (int): The number of instances below which a value is considered sparse. Default is 10. - Returns - ------- + Returns: (float): Sparsity score, or the percentage of the unique values that exceed count_threshold. """ counts = col.value_counts() diff --git a/evalml/data_checks/target_distribution_data_check.py b/evalml/data_checks/target_distribution_data_check.py index 72ee5c1160..8e187b7f2e 100644 --- a/evalml/data_checks/target_distribution_data_check.py +++ b/evalml/data_checks/target_distribution_data_check.py @@ -20,17 +20,15 @@ class TargetDistributionDataCheck(DataCheck): def validate(self, X, y): """Check if the target data has a certain distribution. - Parameters - ---------- + Args: + X (pd.DataFrame, np.ndarray): Features. Ignored. y (pd.Series, np.ndarray): Target data to check for underlying distributions. - Returns - ------- + Returns: dict (DataCheckError): List with DataCheckErrors if certain distributions are found in the target data. - Examples - -------- + Example: >>> from scipy.stats import lognorm >>> X = None >>> y = [0.946, 0.972, 1.154, 0.954, 0.969, 1.222, 1.038, 0.999, 0.973, 0.897] @@ -43,7 +41,6 @@ def validate(self, X, y): ... "code": "TARGET_LOGNORMAL_DISTRIBUTION", ... "details": {"shapiro-statistic/pvalue": '0.84/0.045'}}], ... "actions": [{'code': 'TRANSFORM_TARGET', 'metadata': {'column': None, 'transformation_strategy': 'lognormal', 'is_target': True}}]} - """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/target_leakage_data_check.py b/evalml/data_checks/target_leakage_data_check.py index b179f7a06b..2816bcab74 100644 --- a/evalml/data_checks/target_leakage_data_check.py +++ b/evalml/data_checks/target_leakage_data_check.py @@ -22,8 +22,7 @@ class TargetLeakageDataCheck(DataCheck): Otherwise, if `method='pearson'`, it uses Pearson correlation and only supports binary with numeric and boolean dtypes. Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. - Parameters - ---------- + Args: pct_corr_threshold (float): The correlation threshold to be considered leakage. Defaults to 0.95. method (string): The method to determine correlation. Use 'mutual' for mutual information, otherwise 'pearson' for Pearson correlation. Defaults to 'mutual'. """ @@ -73,17 +72,15 @@ def validate(self, X, y): If `method='mutual'`, supports all target and feature types. Otherwise, if `method='pearson'` only supports binary with numeric and boolean dtypes. Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. - Parameters - ---------- + Args: + X (pd.DataFrame, np.ndarray): The input features to check y (pd.Series, np.ndarray): The target data - Returns - ------- + Returns: dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected. - Examples - -------- + Example: >>> import pandas as pd >>> X = pd.DataFrame({ ... 
'leak': [10, 42, 31, 51, 61], diff --git a/evalml/data_checks/uniqueness_data_check.py b/evalml/data_checks/uniqueness_data_check.py index 14c959e3c7..5f799f59ae 100644 --- a/evalml/data_checks/uniqueness_data_check.py +++ b/evalml/data_checks/uniqueness_data_check.py @@ -22,8 +22,7 @@ class UniquenessDataCheck(DataCheck): """Check if there are any columns in the input that are either too unique for classification problems or not unique enough for regression problems. - Arguments - --------- + Args: problem_type (str or ProblemTypes): The specific problem type to data check for. e.g. 'binary', 'multiclass', 'regression, 'time series regression' threshold(float): The threshold to set as an upper bound on uniqueness for classification type problems @@ -39,18 +38,15 @@ def __init__(self, problem_type, threshold=0.50): def validate(self, X, y=None): """Check if there are any columns in the input that are too unique in the case of classification problems or not unique enough in the case of regression problems. - Parameters - ---------- + Args: X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. Defaults to None. - Returns - ------- + Returns: dict: dict with a DataCheckWarning if there are any too unique or not unique enough columns. - Examples - -------- + Example: >>> import pandas as pd >>> df = pd.DataFrame({ ... 'regression_unique_enough': [float(x) for x in range(100)], @@ -133,12 +129,10 @@ def uniqueness_score(col): Based on the Herfindahl–Hirschman Index. - Parameters - ---------- + Args: col (pd.Series): Feature values. - Returns - ------- + Returns: (float): Uniqueness score. """ norm_counts = col.value_counts() / col.value_counts().sum() diff --git a/evalml/data_checks/utils.py b/evalml/data_checks/utils.py index d8d7e88190..073abf30a8 100644 --- a/evalml/data_checks/utils.py +++ b/evalml/data_checks/utils.py @@ -5,8 +5,7 @@ class EmptyDataChecks(DataChecks): """An empty collection of data checks. - Parameters - ---------- + Args: data_checks (list (DataCheck)): Ignored. """ diff --git a/evalml/demos/breast_cancer.py b/evalml/demos/breast_cancer.py index 47aa014701..fd83efdbd6 100644 --- a/evalml/demos/breast_cancer.py +++ b/evalml/demos/breast_cancer.py @@ -8,8 +8,7 @@ def load_breast_cancer(): """Load breast cancer dataset. Binary classification problem. - Returns - ------ + Returns: (pd.Dataframe, pd.Series): X and y """ filepath = ( diff --git a/evalml/demos/churn.py b/evalml/demos/churn.py index e64f50bc18..385fa29ada 100644 --- a/evalml/demos/churn.py +++ b/evalml/demos/churn.py @@ -6,13 +6,11 @@ def load_churn(n_rows=None, verbose=True): """Load churn dataset, which can be used for binary classification problems. - Parameters - --------- + Args: n_rows (int): Number of rows from the dataset to return verbose (bool): Whether to print information about features and labels - Returns - ------ + Returns: (pd.Dataframe, pd.Series): X and y """ churn_data_path = ( diff --git a/evalml/demos/diabetes.py b/evalml/demos/diabetes.py index f1eabcc970..4ee5ac9971 100644 --- a/evalml/demos/diabetes.py +++ b/evalml/demos/diabetes.py @@ -9,9 +9,8 @@ def load_diabetes(): """Load diabetes dataset. Used for regression problem. 
-    Returns
-    ------
-    (pd.Dataframe, pd.Series): X and y
+    Returns:
+        (pd.Dataframe, pd.Series): X and y
     """
     filename = (
         "https://api.featurelabs.com/datasets/diabetes.csv?library=evalml&version="
diff --git a/evalml/demos/fraud.py b/evalml/demos/fraud.py
index 9a2cb42b31..3a549ad15e 100644
--- a/evalml/demos/fraud.py
+++ b/evalml/demos/fraud.py
@@ -8,13 +8,11 @@ def load_fraud(n_rows=None, verbose=True):
 
     The fraud dataset can be used for binary classification problems.
 
-    Parameters
-    ---------
+    Args:
         n_rows (int): Number of rows from the dataset to return
         verbose (bool): Whether to print information about features and labels
 
-    Returns
-    ------
+    Returns:
         (pd.Dataframe, pd.Series): X and y
     """
     fraud_data_path = (
diff --git a/evalml/demos/wine.py b/evalml/demos/wine.py
index b06f1a5783..e9fee6dba3 100644
--- a/evalml/demos/wine.py
+++ b/evalml/demos/wine.py
@@ -8,8 +8,7 @@ def load_wine():
     """Load wine dataset. Multiclass problem.
 
-    Returns
-    ------
+    Returns:
         (pd.Dataframe, pd.Series): X and y
     """
     filepath = (
diff --git a/evalml/exceptions/__init__.py b/evalml/exceptions/__init__.py
index 3bd8d3c551..edcf617321 100644
--- a/evalml/exceptions/__init__.py
+++ b/evalml/exceptions/__init__.py
@@ -1,3 +1,4 @@
+"""Exceptions used in EvalML."""
 from .exceptions import (
     MethodPropertyNotFoundError,
     PipelineNotFoundError,
diff --git a/evalml/exceptions/exceptions.py b/evalml/exceptions/exceptions.py
index aac178c782..72140fe75d 100644
--- a/evalml/exceptions/exceptions.py
+++ b/evalml/exceptions/exceptions.py
@@ -1,3 +1,4 @@
+"""Exceptions used in EvalML."""
 from enum import Enum
 
 
@@ -20,7 +21,7 @@ class ObjectiveNotFoundError(Exception):
 
 
 class MissingComponentError(Exception):
-    """An exception raised when a component is not found in all_components()"""
+    """An exception raised when a component is not found in all_components()."""
 
     pass
 
@@ -52,7 +53,7 @@ class EnsembleMissingPipelinesError(Exception):
 class PipelineScoreError(Exception):
     """An exception raised when a pipeline errors while scoring any objective in a list of objectives.
 
-    Parameters
+    Args:
         exceptions (dict): A dictionary mapping an objective name (str) to a tuple of the form (exception, traceback).
             All of the objectives that errored will be stored here.
         scored_successfully (dict): A dictionary mapping an objective name (str) to a score value. All of the objectives
@@ -98,7 +99,7 @@ class ParameterNotUsedWarning(UserWarning):
     def __init__(self, components):
         self.components = components
 
         msg = f"Parameters for components {components} will not be used to instantiate the pipeline since they don't appear in the pipeline"
         super().__init__(msg)
 
 
diff --git a/evalml/model_family/utils.py b/evalml/model_family/utils.py
index f232bdb593..f18b82ad4d 100644
--- a/evalml/model_family/utils.py
+++ b/evalml/model_family/utils.py
@@ -4,10 +4,10 @@
 def handle_model_family(model_family):
     """Handles model_family by either returning the ModelFamily or converting from a string.
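For readers unfamiliar with the pattern, handle_model_family above is a string-or-enum normalizer. A minimal sketch with an illustrative two-member enum (evalml's ModelFamily has more members, and its exact error text may differ):

```python
from enum import Enum

class ModelFamily(Enum):
    RANDOM_FOREST = "random_forest"
    LINEAR_MODEL = "linear_model"

def handle_model_family(model_family):
    # Accept either a ModelFamily member or its string value.
    if isinstance(model_family, ModelFamily):
        return model_family
    try:
        return ModelFamily(model_family)
    except ValueError:
        raise KeyError(f"Model family '{model_family}' does not exist")

print(handle_model_family("random_forest"))           # ModelFamily.RANDOM_FOREST
print(handle_model_family(ModelFamily.LINEAR_MODEL))  # passed through unchanged
```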
-    Parameters
+    Args:
         model_family (str or ModelFamily): Model type that needs to be handled
 
     Returns:
         ModelFamily
     """
diff --git a/evalml/model_understanding/__init__.py b/evalml/model_understanding/__init__.py
index fb5bdc7238..002b97a63e 100644
--- a/evalml/model_understanding/__init__.py
+++ b/evalml/model_understanding/__init__.py
@@ -1,3 +1,4 @@
+"""Model understanding tools."""
 from .graphs import (
     binary_objective_vs_threshold,
     confusion_matrix,
diff --git a/evalml/model_understanding/force_plots.py b/evalml/model_understanding/force_plots.py
index 02e8d7ac4a..5f9a337e32 100644
--- a/evalml/model_understanding/force_plots.py
+++ b/evalml/model_understanding/force_plots.py
@@ -1,3 +1,4 @@
+"""Force plots."""
 import numpy as np
 import shap
 from shap import initjs
@@ -89,7 +90,6 @@ def force_plot(pipeline, rows_to_explain, training_data, y):
         TypeError: if rows_to_explain is not a list.
         TypeError: if all values in rows_to_explain aren't integers.
     """
-
     if not isinstance(rows_to_explain, list):
         raise TypeError(
             "rows_to_explain should be provided as a list of row index integers!"
diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index 5f039ee83a..120bbdeafb 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -44,7 +44,7 @@
 def confusion_matrix(y_true, y_predicted, normalize_method="true"):
     """Confusion matrix for binary and multiclass classification.
 
-    Parameters
+    Args:
         y_true (pd.Series or np.ndarray): True binary labels.
         y_pred (pd.Series or np.ndarray): Predictions from a binary classifier.
         normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'.
@@ -67,7 +67,7 @@ def normalize_confusion_matrix(conf_mat, normalize_method="true"):
     """Normalize a confusion matrix.
 
-    Parameters
+    Args:
         conf_mat (pd.DataFrame or np.ndarray): Confusion matrix to normalize.
         normalize_method ({'true', 'pred', 'all'}): Normalization method. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'.
 
@@ -106,7 +106,7 @@ def graph_confusion_matrix(
     If `normalize_method` is set, hover text will show raw count, otherwise hover text will show count normalized with method 'true'.
 
-    Parameters
+    Args:
         y_true (pd.Series or np.ndarray): True binary labels.
         y_pred (pd.Series or np.ndarray): Predictions from a binary classifier.
         normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'.
@@ -184,7 +184,7 @@ def graph_confusion_matrix(
 def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1):
     """Given labels and binary classifier predicted probabilities, compute and return the data representing a precision-recall curve.
 
-    Parameters
+    Args:
         y_true (pd.Series or np.ndarray): True binary labels.
         y_pred_proba (pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label.
         pos_label_idx (int): the column index corresponding to the positive class.
If predicted probabilities are two-dimensional, this will be used to access the probabilities for the positive class. @@ -222,7 +222,7 @@ def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): """Generate and display a precision-recall plot. - Parameters + Args: y_true (pd.Series or np.ndarray): True binary labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label. title_addition (str or None): If not None, append to plot title. Default None. @@ -261,7 +261,7 @@ def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): def roc_curve(y_true, y_pred_proba): """Given labels and classifier predicted probabilities, compute and return the data representing a Receiver Operating Characteristic (ROC) curve. Works with binary or multiclass problems. - Parameters + Args: y_true (pd.Series or np.ndarray): True labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. @@ -310,7 +310,7 @@ def roc_curve(y_true, y_pred_proba): def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_addition=None): """Generate and display a Receiver Operating Characteristic (ROC) plot for binary and multiclass classification problems. - Parameters + Args: y_true (pd.Series or np.ndarray): True labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. Note this should a one dimensional array with the predicted probability for the "true" label in the binary case. custom_class_labels (list or None): If not None, custom labels for classes. Default None. @@ -369,7 +369,7 @@ def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_additio def graph_permutation_importance(pipeline, X, y, objective, importance_threshold=0): """Generate a bar graph of the pipeline's permutation importance. - Parameters + Args: pipeline (PipelineBase or subclass): Fitted pipeline X (pd.DataFrame): The input data used to score and compute permutation importance y (pd.Series): The target data @@ -428,7 +428,7 @@ def graph_permutation_importance(pipeline, X, y, objective, importance_threshold def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): """Compute objective score as a function of potential binary classification decision thresholds for a fitted binary classification pipeline. - Parameters + Args: pipeline (BinaryClassificationPipeline obj): Fitted binary classification pipeline X (pd.DataFrame): The input data used to compute objective score y (pd.Series): The target labels @@ -460,7 +460,7 @@ def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): def graph_binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): """Generate a plot graphing objective score vs. decision thresholds for a fitted binary classification pipeline. - Parameters + Args: pipeline (PipelineBase or subclass): Fitted pipeline X (pd.DataFrame): The input data used to score and compute scores y (pd.Series): The target labels @@ -570,10 +570,10 @@ def partial_dependence( is calculated with the first feature in the y-axis and second feature in the x-axis. - Parameters - pipeline (PipelineBase or subclass): Fitted pipeline + Args: + pipeline (PipelineBase or subclass): Fitted pipeline. 
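The normalize_method option that recurs in the confusion-matrix docstrings above reduces to a choice of axis. A minimal numpy sketch of the three modes (evalml's version also accepts pandas DataFrames):

```python
import numpy as np

def normalize_conf_mat(conf_mat, method="true"):
    # 'true': each row sums to 1; 'pred': each column sums to 1;
    # 'all': every entry divided by the grand total.
    conf_mat = conf_mat.astype(float)
    if method == "true":
        return conf_mat / conf_mat.sum(axis=1, keepdims=True)
    if method == "pred":
        return conf_mat / conf_mat.sum(axis=0, keepdims=True)
    if method == "all":
        return conf_mat / conf_mat.sum()
    raise ValueError(f"Invalid normalization method: {method}")

cm = np.array([[8, 2], [1, 9]])
print(normalize_conf_mat(cm, "true"))  # [[0.8, 0.2], [0.1, 0.9]]
```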
X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values
-            for feature where partial dependence will be calculated at
+            for feature where partial dependence will be calculated at.
         features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for.
             If features is an int, it must be the index of the feature to use.
             If features is a string, it must be a valid column name in X.
@@ -618,7 +618,6 @@ def partial_dependence(
         PartialDependenceError: if any of the features are low-variance. Defined as having one value occurring more than the upper percentile passed by the user. By default 95%.
     """
-
     try:
         # Dynamically set the grid resolution to the maximum number of values
         # in the categorical/datetime variables if there are more categories/datetime values than resolution cells
@@ -933,7 +932,7 @@ def graph_partial_dependence(
 ):
     """Create an one-way or two-way partial dependence plot. Passing a single integer or string as features will create a one-way partial dependence plot with the feature values plotted against the partial dependence. Passing features a tuple of int/strings will create a two-way partial dependence plot with a contour of feature[0] in the y-axis, feature[1] in the x-axis and the partial dependence in the z-axis.
 
-    Parameters
+    Args:
         pipeline (PipelineBase or subclass): Fitted pipeline.
         X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values
             for feature where partial dependence will be calculated at.
@@ -950,7 +949,7 @@ def graph_partial_dependence(
         (PD) graph, 'individual' creates an individual conditional expectation (ICE) plot,
             and 'both' creates a single-figure PD and ICE plot.
             ICE plots can only be shown for one-way partial dependence plots.
 
     Returns:
         plotly.graph_objects.Figure: figure object containing the partial dependence data for plotting
 
     Raises:
@@ -1176,7 +1175,7 @@ def _calculate_axis_range(arr):
 def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None):
     """Combine y_true and y_pred into a single dataframe and adds a column for outliers. Used in `graph_prediction_vs_actual()`.
 
-    Parameters
+    Args:
         y_true (pd.Series, or np.ndarray): The real target values of the data
         y_pred (pd.Series, or np.ndarray): The predicted values outputted by the regression model.
         outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference
@@ -1215,7 +1214,7 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None):
 def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None):
     """Generate a scatter plot comparing the true and predicted values. Used for regression plotting.
 
-    Parameters
+    Args:
         y_true (pd.Series): The real target values of the data
         y_pred (pd.Series): The predicted values outputted by the regression model.
         outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference
@@ -1298,7 +1297,7 @@ def recurse(i):
 def decision_tree_data_from_estimator(estimator):
     """Return data for a fitted tree in a restructured format.
 
-    Parameters
+    Args:
         estimator (ComponentBase): A fitted DecisionTree-based estimator.
 
     Returns:
@@ -1321,7 +1320,7 @@ def decision_tree_data_from_estimator(estimator):
 def decision_tree_data_from_pipeline(pipeline_):
     """Return data for a fitted pipeline with in a restructured format.
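evalml's partial_dependence is built on the same computation scikit-learn exposes; for orientation, a minimal sklearn-only sketch (dataset, feature name, and grid size are illustrative):

```python
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import partial_dependence

X, y = load_diabetes(return_X_y=True, as_frame=True)
model = RandomForestRegressor(n_estimators=20, random_state=0).fit(X, y)

# One-way partial dependence of the prediction on "bmi" over a 20-point grid.
results = partial_dependence(model, X, features=["bmi"], grid_resolution=20)
print(results["average"].shape)  # (1, 20): one output, 20 grid points
```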
- Parameters + Args: pipeline_ (PipelineBase): A pipeline with a DecisionTree-based estimator. Returns: @@ -1347,7 +1346,7 @@ def visualize_decision_tree( ): """Generate an image visualizing the decision tree. - Parameters + Args: estimator (ComponentBase): A fitted DecisionTree-based estimator. max_depth (int, optional): The depth to which the tree should be displayed. If set to None (as by default), tree is fully generated. @@ -1423,7 +1422,7 @@ def visualize_decision_tree( def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): """Get the data needed for the prediction_vs_actual_over_time plot. - Parameters + Args: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. @@ -1432,7 +1431,6 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): Returns: pd.DataFrame """ - dates = infer_feature_types(dates) y = infer_feature_types(y) prediction = pipeline.predict(X, y) @@ -1449,7 +1447,7 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): """Plot the target values and predictions against time on the x-axis. - Parameters + Args: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. @@ -1499,7 +1497,7 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): def get_linear_coefficients(estimator, features=None): """Return a dataframe showing the features with the greatest predictive power for a linear model. - Parameters + Args: estimator (Estimator): Fitted linear model family estimator. features (list[str]): List of feature names associated with the underlying data. @@ -1535,7 +1533,7 @@ def t_sne( ): """Get the transformed output after fitting X to the embedded space using t-SNE. - Parameters + Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning @@ -1578,7 +1576,7 @@ def graph_t_sne( ): """Plot high dimensional data into lower dimensional space using t-SNE . - Parameters + Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py index d9314a22ca..bf492b05fe 100644 --- a/evalml/model_understanding/permutation_importance.py +++ b/evalml/model_understanding/permutation_importance.py @@ -13,7 +13,7 @@ def calculate_permutation_importance( ): """Calculates permutation importance for features. - Parameters + Args: pipeline (PipelineBase or subclass): Fitted pipeline. X (pd.DataFrame): The input data used to score and compute permutation importance. y (pd.Series): The target data. @@ -22,6 +22,7 @@ def calculate_permutation_importance( n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Defaults to None. 
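The permutation importance documented here follows the same shuffle-and-rescore recipe as scikit-learn's implementation; a minimal sklearn-only sketch of the idea:

```python
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
model = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

# Mean drop in accuracy over 5 shuffles of each column, largest first.
result = permutation_importance(model, X, y, n_repeats=5, random_state=0)
top = result.importances_mean.argsort()[::-1][:3]
print(X.columns[top].tolist())
```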
random_seed (int): Seed for the random number generator. Defaults to 0. + Returns: pd.DataFrame: Mean feature importance scores over a number of shuffles. """ @@ -77,7 +78,7 @@ def calculate_permutation_importance_one_column( ): """Calculates permutation importance for one column in the original dataframe. - Parameters + Args: pipeline (PipelineBase or subclass): Fitted pipeline. X (pd.DataFrame): The input data used to score and compute permutation importance. y (pd.Series): The target data. diff --git a/evalml/model_understanding/prediction_explanations/__init__.py b/evalml/model_understanding/prediction_explanations/__init__.py index 7b5ec80f5c..91132735a4 100644 --- a/evalml/model_understanding/prediction_explanations/__init__.py +++ b/evalml/model_understanding/prediction_explanations/__init__.py @@ -1 +1,2 @@ +"""Prediction explanation tools.""" from .explainers import explain_predictions_best_worst, explain_predictions diff --git a/evalml/model_understanding/prediction_explanations/_algorithms.py b/evalml/model_understanding/prediction_explanations/_algorithms.py index 69a3c7631d..9e71afee2d 100644 --- a/evalml/model_understanding/prediction_explanations/_algorithms.py +++ b/evalml/model_understanding/prediction_explanations/_algorithms.py @@ -15,12 +15,12 @@ def _create_dictionary(shap_values, feature_names): """Creates a mapping from a feature name to a list of SHAP values for all points that were queried. - Parameters + Args: shap_values (np.ndarray): SHAP values stored in an array of shape (n_datapoints, n_features). feature_names (Iterable): Iterable storing the feature names as they are ordered in the dataset. Returns: - dictionary + dict """ if not isinstance(shap_values, np.ndarray): raise ValueError("SHAP values must be stored in a numpy array!") @@ -34,7 +34,7 @@ def _create_dictionary(shap_values, feature_names): def _compute_shap_values(pipeline, features, training_data=None): """Computes SHAP values for each feature. - Parameters + Args: pipeline (PipelineBase): Trained pipeline whose predictions we want to explain with SHAP. features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on. training_data (pd.DataFrame): Training data the pipeline was fit on. @@ -147,15 +147,14 @@ def _aggreggate_shap_values_dict(values, provenance): This aggregation will happen for all features for which we know the provenance/lineage. Other features will be left as they are. - Parameters - values (dict): A mapping of feature names to a list of SHAP values for each data point. + Args: + values (dict): A mapping of feature names to a list of SHAP values for each data point. provenance (dict): A mapping from a feature in the original data to the names of the features that were created from that feature. Returns: - dict - mapping from feature name to shap values. + dict: Dictionary mapping from feature name to shap values. """ - child_to_parent = {} for parent_feature, children in provenance.items(): for child in children: @@ -179,12 +178,10 @@ def _aggreggate_shap_values_dict(values, provenance): def _aggregate_shap_values(values, provenance): """Aggregates shap values across features created from a common feature. - Parameters + Args: values (dict): A mapping of feature names to a list of SHAP values for each data point. 
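The provenance-based aggregation described above sums child-feature SHAP values back into their parent feature. A small self-contained sketch (evalml's private helper handles more cases):

```python
def aggregate_shap_values(values, provenance):
    # Sum SHAP values of engineered child features back into their parent;
    # features without provenance are passed through unchanged.
    child_to_parent = {c: p for p, children in provenance.items() for c in children}
    out = {}
    for name, vals in values.items():
        parent = child_to_parent.get(name, name)
        if parent not in out:
            out[parent] = list(vals)
        else:
            out[parent] = [a + b for a, b in zip(out[parent], vals)]
    return out

values = {"cats_a": [0.1, 0.2], "cats_b": [0.3, -0.1], "age": [0.5, 0.4]}
print(aggregate_shap_values(values, {"cats": ["cats_a", "cats_b"]}))
# {'cats': [0.4, 0.1], 'age': [0.5, 0.4]} (up to float rounding)
```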
provenance (dict): A mapping from a feature in the original data to the names of the features that were created from that feature - Returns: - dict Returns: dict or list(dict) @@ -201,18 +198,17 @@ def _aggregate_shap_values(values, provenance): def _normalize_values_dict(values): """Normalizes SHAP values by dividing by the sum of absolute values for each feature. - Parameters + Args: values (dict): A mapping of feature names to a list of SHAP values for each data point. Returns: dict - Examples: + Example: >>> values = {"a": [1, -1, 3], "b": [3, -2, 0], "c": [-1, 3, 4]} >>> normalized_values = _normalize_values_dict(values) >>> assert normalized_values == {"a": [1/5, -1/6, 3/7], "b": [3/5, -2/6, 0/7], "c": [-1/5, 3/6, 4/7]} """ - # Store in matrix of shape (len(values), n_features) feature_names = list(values.keys()) all_values = np.stack([values[feature_name] for feature_name in feature_names]).T @@ -231,7 +227,7 @@ def _normalize_values_dict(values): def _normalize_shap_values(values): """Normalizes the SHAP values by the absolute value of their sum for each data point. - Parameters + Args: values (dict or list(dict)): Dictionary mapping feature name to list of values, or a list of dictionaries (each mapping a feature name to a list of values). diff --git a/evalml/model_understanding/prediction_explanations/_report_creator_factory.py b/evalml/model_understanding/prediction_explanations/_report_creator_factory.py index eb0aed8bed..76ceb787ac 100644 --- a/evalml/model_understanding/prediction_explanations/_report_creator_factory.py +++ b/evalml/model_understanding/prediction_explanations/_report_creator_factory.py @@ -27,7 +27,7 @@ def _report_creator_factory( ): """Get and initialize the report creator class given the ReportData and parameters passed in by the user. - Parameters + Args: data (_ReportData): Data about the problem (pipeline/predicted values, etc) needed for the report. report_type (str): Either "explain_predictions" or "explain_predictions_best_worst" output_format (str): Either "text" or "dict" - passed in by user. diff --git a/evalml/model_understanding/prediction_explanations/_user_interface.py b/evalml/model_understanding/prediction_explanations/_user_interface.py index 119129c219..cc59f29024 100644 --- a/evalml/model_understanding/prediction_explanations/_user_interface.py +++ b/evalml/model_understanding/prediction_explanations/_user_interface.py @@ -22,7 +22,7 @@ def _make_rows( ): """Makes the rows (one row for each feature) for the SHAP table. - Parameters + Args: shap_values (dict): Dictionary mapping the feature names to their SHAP values. In a multiclass setting, this dictionary for correspond to the SHAP values for a single class. normalized_values (dict): Normalized SHAP values. Same structure as shap_values parameter. @@ -80,7 +80,6 @@ def _make_rows( def _rows_to_dict(rows): """Turns a list of lists into a dictionary.""" - feature_names = [] feature_values = [] qualitative_explanations = [] @@ -128,7 +127,7 @@ def _make_text_table( ): """Make a table displaying the SHAP values for a prediction. - Parameters + Args: shap_values (dict): Dictionary mapping the feature names to their SHAP values. In a multiclass setting, this dictionary for correspond to the SHAP values for a single class. normalized_values (dict): Normalized SHAP values. Same structure as shap_values parameter. 
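_normalize_values_dict's doctest above pins the behavior: each data point's SHAP values are divided by the sum of their absolute values. A numpy sketch that reproduces it (not the private API itself):

```python
import numpy as np

def normalize_values_dict(values):
    feature_names = list(values)
    # Shape (n_points, n_features): one row of SHAP values per data point.
    matrix = np.stack([values[name] for name in feature_names]).T
    scale = np.abs(matrix).sum(axis=1, keepdims=True)
    normalized = matrix / scale
    return {name: normalized[:, i].tolist() for i, name in enumerate(feature_names)}

values = {"a": [1, -1, 3], "b": [3, -2, 0], "c": [-1, 3, 4]}
print(normalize_values_dict(values))
# {'a': [1/5, -1/6, 3/7], 'b': [3/5, -2/6, 0/7], 'c': [-1/5, 3/6, 4/7]}
```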
@@ -484,7 +483,7 @@ def _make_single_prediction_shap_table( ): """Creates table summarizing the top_k_features positive and top_k_features negative contributing features to the prediction of a single datapoint. - Parameters + Args: pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP. pipeline_features (pd.DataFrame): Dataframe of features computed by the pipeline. input_features (pd.DataFrame): Dataframe of features passed to the pipeline. This is where the pipeline_features @@ -595,7 +594,7 @@ def make_text(self, rank): Differences between best/worst reports and reports where user manually specifies the input features subset are handled by formatting the value of the prefix parameter in the initialization. - Parameters + Args: rank (int): Rank (1, 2, 3, ...) of the prediction. Used to say "Best 1 of 5", "Worst 1 of 5", etc. """ prefix = self.prefixes[(rank // self.n_indices)] @@ -605,7 +604,7 @@ def make_text(self, rank): def make_dict(self, rank): """Makes the heading section for reports formatted as dictionaries. - Parameters + Args: rank (int): Rank (1, 2, 3, ...) of the prediction. Used to say "Best 1 of 5", "Worst 1 of 5", etc. """ prefix = self.prefixes[(rank // self.n_indices)] @@ -615,7 +614,7 @@ def make_dict(self, rank): def make_dataframe(self, rank): """Makes the heading section for reports formatted as a dataframe. - Parameters + Args: rank (int): Rank (1, 2, 3, ...) of the prediction. Used to say "Best 1 of 5", "Worst 1 of 5", etc. """ return self.make_dict(rank) @@ -634,7 +633,7 @@ def __init__(self, error_name, y_pred_values): def make_text(self, index, y_pred, y_true, scores, dataframe_index): """Makes the predicted values section for classification problem best/worst reports formatted as text. - Parameters + Args: index (int): The index of the prediction in the dataset. y_pred (pd.Series): Pipeline predictions on the entire dataset. y_true (pd.Series): Targets for the entire dataset. @@ -687,7 +686,7 @@ def __init__(self, error_name, y_pred_values=None): def make_text(self, index, y_pred, y_true, scores, dataframe_index): """Makes the predicted values section for regression problem best/worst reports formatted as text. - Parameters + Args: index (int): The index of the prediction in the dataset. y_pred (pd.Series): Pipeline predictions on the entire dataset. y_true (pd.Series): Targets for the entire dataset. @@ -734,7 +733,7 @@ def make_text(self, index, pipeline, pipeline_features, input_features): Handling the differences in how the table is formatted between regression and classification problems is delegated to the _make_single_prediction_shap_table - Parameters + Args: index (int): The index of the prediction in the dataset. pipeline (PipelineBase): The pipeline to explain. pipeline_features (pd.DataFrame): The dataframe of features created by the pipeline. @@ -803,7 +802,7 @@ def __init__(self, heading_maker, predicted_values_maker, table_maker): def make_text(self, data): """Make a prediction explanation report that is formatted as text. - Parameters + Args: data (_ReportData): Data passed in by the user. Returns: @@ -834,7 +833,7 @@ def make_text(self, data): def make_dict(self, data): """Make a prediction explanation report that is formatted as a dictionary. - Parameters + Args: data (_ReportData): Data passed in by the user. 
 
     Returns:
diff --git a/evalml/model_understanding/prediction_explanations/explainers.py b/evalml/model_understanding/prediction_explanations/explainers.py
index be6277c377..b5b9fe0c9e 100644
--- a/evalml/model_understanding/prediction_explanations/explainers.py
+++ b/evalml/model_understanding/prediction_explanations/explainers.py
@@ -1,3 +1,4 @@
+"""Prediction explanation tools."""
 import sys
 import traceback
 from collections import namedtuple
@@ -47,7 +48,7 @@ def explain_predictions(
     XGBoost and Stacked Ensemble models, as well as CatBoost multiclass classifiers, are not currently supported.
 
-    Parameters
+    Args:
         pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
         input_features (pd.DataFrame): Dataframe of input data to evaluate the pipeline on.
         y (pd.Series): Labels for the input data.
@@ -108,10 +109,11 @@ def explain_predictions(
 
 
 def _update_progress(start_time, current_time, progress_stage, callback_function):
-    """Helper function for updating progress of a function and making a call to the user-provided callback function, if provided. The callback function should accept the following parameters:
-
-        - progress_stage: stage of computation
-        - time_elapsed: total time in seconds that has elapsed since start of call
+    """Helper function for updating progress of a function and making a call to the user-provided callback function, if provided.
+
+    The callback function should accept the following parameters:
+        - progress_stage: stage of computation
+        - time_elapsed: total time in seconds that has elapsed since start of call
     """
     if callback_function is not None:
         elapsed_time = current_time - start_time
@@ -119,6 +121,7 @@ def _update_progress(start_time, current_time, progress_stage, callback_function
 
 
 class ExplainPredictionsStage(Enum):
+    """Enum for prediction stage."""
     PREPROCESSING_STAGE = "preprocessing_stage"
     PREDICT_STAGE = "predict_stage"
     COMPUTE_FEATURE_STAGE = "compute_feature_stage"
@@ -141,7 +144,7 @@ def explain_predictions_best_worst(
     XGBoost and Stacked Ensemble models, as well as CatBoost multiclass classifiers, are not currently supported.
 
-    Parameters
+    Args:
         pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
         input_features (pd.DataFrame): Input data to evaluate the pipeline on.
         y_true (pd.Series): True labels for the input data.
@@ -183,7 +186,7 @@ def explain_predictions_best_worst(
     )
     if y_true.shape[0] != input_features.shape[0]:
         raise ValueError(
             "Parameters y_true and input_features must have the same number of data points. Received: "
             f"true labels: {y_true.shape[0]} and {input_features.shape[0]}"
         )
     if output_format not in {"text", "dict", "dataframe"}:
@@ -268,7 +271,7 @@ def explain_predictions_best_worst(
 def abs_error(y_true, y_pred):
     """Computes the absolute error per data point for regression problems.
 
-    Parameters
+    Args:
         y_true (pd.Series): True labels.
         y_pred (pd.Series): Predicted values.
 
@@ -281,7 +284,7 @@ def abs_error(y_true, y_pred):
 def cross_entropy(y_true, y_pred_proba):
     """Computes Cross Entropy Loss per data point for classification problems.
 
-    Parameters
+    Args:
         y_true (pd.Series): True labels encoded as ints.
         y_pred_proba (pd.DataFrame): Predicted probabilities. One column per class.
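abs_error and cross_entropy above are per-row error metrics used to rank best/worst predictions. A minimal numpy/pandas sketch of both (the positional-label indexing is an assumption):

```python
import numpy as np
import pandas as pd

def abs_error(y_true, y_pred):
    # Per-row absolute error for regression problems.
    return np.abs(np.asarray(y_true) - np.asarray(y_pred))

def cross_entropy(y_true, y_pred_proba):
    # Per-row negative log-likelihood of the true class; assumes class k's
    # probabilities live in the k-th column of y_pred_proba.
    proba_of_true = np.array(
        [y_pred_proba.iloc[i, int(label)] for i, label in enumerate(y_true)]
    )
    return -np.log(proba_of_true)

proba = pd.DataFrame({0: [0.9, 0.2], 1: [0.1, 0.8]})
print(abs_error([3.0, 5.0], [2.5, 7.0]))  # [0.5 2. ]
print(cross_entropy([0, 1], proba))       # [0.105... 0.223...]
```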
diff --git a/evalml/objectives/binary_classification_objective.py b/evalml/objectives/binary_classification_objective.py
index a28bcc78fa..608ae037c9 100644
--- a/evalml/objectives/binary_classification_objective.py
+++ b/evalml/objectives/binary_classification_objective.py
@@ -25,7 +25,7 @@ def can_optimize_threshold(cls):
     def optimize_threshold(self, ypred_proba, y_true, X=None):
         """Learn a binary classification threshold which optimizes the current objective.
 
-    Parameters
+    Args:
         ypred_proba (pd.Series): The classifier's predicted probabilities
         y_true (pd.Series): The ground truth for the predictions.
         X (pd.DataFrame, optional): Any extra columns that are needed from training data.
@@ -57,7 +57,7 @@ def cost(threshold):
     def decision_function(self, ypred_proba, threshold=0.5, X=None):
         """Apply a learned threshold to predicted probabilities to get predicted classes.
 
-    Parameters
+    Args:
         ypred_proba (pd.Series, np.ndarray): The classifier's predicted probabilities
         threshold (float, optional): Threshold used to make a prediction. Defaults to 0.5.
         X (pd.DataFrame, optional): Any extra columns that are needed from training data.
diff --git a/evalml/objectives/cost_benefit_matrix.py b/evalml/objectives/cost_benefit_matrix.py
index caba301868..e87ac983d8 100644
--- a/evalml/objectives/cost_benefit_matrix.py
+++ b/evalml/objectives/cost_benefit_matrix.py
@@ -8,7 +8,7 @@ class CostBenefitMatrix(BinaryClassificationObjective):
     """Score using a cost-benefit matrix. Scores quantify the benefits of a given value, so greater numeric scores represents a better score. Costs and scores can be negative, indicating that a value is not beneficial. For example, in the case of monetary profit, a negative cost and/or score represents loss of cash flow.
 
-    Parameters
+    Args:
         true_positive (float): Cost associated with true positive predictions
         true_negative (float): Cost associated with true negative predictions
         false_positive (float): Cost associated with false positive predictions
@@ -25,7 +25,7 @@ class CostBenefitMatrix(BinaryClassificationObjective):
     def __init__(self, true_positive, true_negative, false_positive, false_negative):
         if None in {true_positive, true_negative, false_positive, false_negative}:
             raise ValueError(
                 "Parameters to CostBenefitMatrix must all be numeric values."
             )
 
         self.true_positive = true_positive
@@ -36,7 +36,7 @@ def __init__(self, true_positive, true_negative, false_positive, false_negative)
     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
         """Calculates cost-benefit of the using the predicted and true values.
 
-    Parameters
+    Args:
         y_predicted (pd.Series): Predicted labels
         y_true (pd.Series): True labels
         X (pd.DataFrame): Ignored.
diff --git a/evalml/objectives/fraud_cost.py b/evalml/objectives/fraud_cost.py
index 20905be496..975ddff3c3 100644
--- a/evalml/objectives/fraud_cost.py
+++ b/evalml/objectives/fraud_cost.py
@@ -4,7 +4,7 @@ class FraudCost(BinaryClassificationObjective):
     """Score the percentage of money lost of the total transaction amount process due to fraud.
 
-    Parameters
+    Args:
         retry_percentage (float): What percentage of customers that will retry a transaction if it is
             declined. Between 0 and 1. Defaults to 0.5.
         interchange_fee (float): How much of each successful transaction you pay.
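The CostBenefitMatrix weighting above amounts to a dot product between confusion-matrix counts and user payoffs. A sketch of that idea (the per-data-point normalization shown here is an assumption, not necessarily evalml's convention):

```python
from sklearn.metrics import confusion_matrix

def cost_benefit_score(y_true, y_pred, tp, tn, fp, fn, per_datapoint=True):
    # Rows of cm are true labels, columns are predictions.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    (tn_count, fp_count), (fn_count, tp_count) = cm
    total = tp * tp_count + tn * tn_count + fp * fp_count + fn * fn_count
    return total / len(y_true) if per_datapoint else total

# e.g. a true positive earns $50, a false negative costs $100.
print(cost_benefit_score([1, 0, 1, 1], [1, 0, 0, 1], tp=50, tn=0, fp=-10, fn=-100))
```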
@@ -36,7 +36,7 @@ def __init__( def objective_function(self, y_true, y_predicted, X, sample_weight=None): """Calculate amount lost to fraud per transaction given predictions, true values, and dataframe with transaction amount. - Parameters + Args: y_predicted (pd.Series): Predicted fraud labels y_true (pd.Series): True fraud labels X (pd.DataFrame): Data with transaction amounts diff --git a/evalml/objectives/lead_scoring.py b/evalml/objectives/lead_scoring.py index b9f6145cbd..3f10edf5e1 100644 --- a/evalml/objectives/lead_scoring.py +++ b/evalml/objectives/lead_scoring.py @@ -6,7 +6,7 @@ class LeadScoring(BinaryClassificationObjective): """Lead scoring. - Parameters + Args: true_positives (int): Reward for a true positive. Defaults to 1. false_positives (int): Cost for a false positive. Should be negative. Defaults to -1. """ @@ -25,7 +25,7 @@ def __init__(self, true_positives=1, false_positives=-1): def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): """Calculate the profit per lead. - Parameters + Args: y_predicted (pd.Series): Predicted labels y_true (pd.Series): True labels X (pd.DataFrame): Ignored. diff --git a/evalml/objectives/objective_base.py b/evalml/objectives/objective_base.py index 414c26dc47..1f0f9c05b6 100644 --- a/evalml/objectives/objective_base.py +++ b/evalml/objectives/objective_base.py @@ -63,7 +63,7 @@ def expected_range(cls): def objective_function(cls, y_true, y_predicted, X=None, sample_weight=None): """Computes the relative value of the provided predictions compared to the actual labels, according a specified metric. - Parameters + Args: y_predicted (pd.Series): Predicted values of length [n_samples] y_true (pd.Series): Actual class labels of length [n_samples] X (pd.DataFrame or np.ndarray): Extra data of shape [n_samples, n_features] necessary to calculate score @@ -84,7 +84,7 @@ def positive_only(cls): def score(self, y_true, y_predicted, X=None, sample_weight=None): """Returns a numerical score indicating performance based on the differences between the predicted and actual values. - Parameters + Args: y_predicted (pd.Series): Predicted values of length [n_samples] y_true (pd.Series): Actual class labels of length [n_samples] X (pd.DataFrame or np.ndarray): Extra data of shape [n_samples, n_features] necessary to calculate score @@ -106,7 +106,7 @@ def score(self, y_true, y_predicted, X=None, sample_weight=None): def _standardize_input_type(input_data): """Standardize input to pandas for scoring. - Parameters + Args: input_data (list, pd.DataFrame, pd.Series, or np.ndarray): A matrix of predictions or predicted probabilities Returns: @@ -126,7 +126,7 @@ def _standardize_input_type(input_data): def validate_inputs(self, y_true, y_predicted): """Validates the input based on a few simple checks. - Parameters + Args: y_predicted (pd.Series, or pd.DataFrame): Predicted values of length [n_samples] y_true (pd.Series): Actual class labels of length [n_samples] @@ -156,7 +156,7 @@ def validate_inputs(self, y_true, y_predicted): def calculate_percent_difference(cls, score, baseline_score): """Calculate the percent difference between scores. - Parameters + Args: score (float): A score. Output of the score method of this objective. baseline_score (float): A score. Output of the score method of this objective. In practice, this is the score achieved on this objective with a baseline estimator. 
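The ObjectiveBase contract above, where score() validates inputs and then delegates to objective_function(), is easy to see in miniature. A toy objective following that pattern (not evalml's actual base-class API):

```python
import numpy as np

class MAEObjective:
    # score() checks shapes, then delegates to objective_function().
    def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
        return float(np.mean(np.abs(np.asarray(y_true) - np.asarray(y_predicted))))

    def score(self, y_true, y_predicted, X=None, sample_weight=None):
        y_true, y_predicted = np.asarray(y_true), np.asarray(y_predicted)
        if y_true.shape[0] != y_predicted.shape[0]:
            raise ValueError("Inputs have mismatched dimensions")
        return self.objective_function(y_true, y_predicted, X=X, sample_weight=sample_weight)

print(MAEObjective().score([1, 2, 3], [1, 4, 2]))  # 1.0
```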
diff --git a/evalml/objectives/sensitivity_low_alert.py b/evalml/objectives/sensitivity_low_alert.py
index 3261e28c14..bf3839f19c 100644
--- a/evalml/objectives/sensitivity_low_alert.py
+++ b/evalml/objectives/sensitivity_low_alert.py
@@ -18,7 +18,7 @@ class SensitivityLowAlert(BinaryClassificationObjective):
     def __init__(self, alert_rate=0.01):
         """Create instance of SensitivityLowAlert.
 
-    Parameters
+    Args:
         alert_rate (float): percentage of top scores to classify as high risk
     """
         if (alert_rate > 1) or (alert_rate < 0):
@@ -29,7 +29,7 @@ def __init__(self, alert_rate=0.01):
     def decision_function(self, ypred_proba, **kwargs):
         """Determine if an observation is high risk given an alert rate.
 
-    Parameters
+    Args:
         ypred_proba (pd.Series): Predicted probabilities
     """
 
@@ -48,7 +48,7 @@ def decision_function(self, ypred_proba, **kwargs):
     def objective_function(self, y_true, y_predicted, **kwargs):
         """Calculate sensitivity across all predictions, using the top alert_rate percent of observations as the predicted positive class.
 
-    Parameters
+    Args:
         y_true (pd.Series): True labels
         y_predicted (pd.Series): Predicted labels based on alert_rate
 
diff --git a/evalml/objectives/utils.py b/evalml/objectives/utils.py
index 595dfea212..8a78684f17 100644
--- a/evalml/objectives/utils.py
+++ b/evalml/objectives/utils.py
@@ -12,7 +12,7 @@ def get_non_core_objectives():
     Non-core objectives are objectives that are domain-specific. Users typically need to configure these objectives
     before using them in AutoMLSearch.
 
     Returns:
         List of ObjectiveBase classes
     """
     return [
@@ -43,7 +43,7 @@ def _all_objectives_dict():
 def get_all_objective_names():
     """Get a list of the names of all objectives.
 
     Returns:
         list (str): Objective names
     """
     all_objectives_dict = _all_objectives_dict()
@@ -66,9 +66,9 @@ def get_core_objective_names():
 
 
 def get_objective(objective, return_instance=False, **kwargs):
-    """Returns the Objective class corresponding to a given objective name.
+    """Return the Objective class corresponding to a given objective name.
 
-    Parameters
+    Args:
         objective (str or ObjectiveBase): Name or instance of the objective class.
         return_instance (bool): Whether to return an instance of the objective. This only applies if objective
             is of type str. Note that the instance will be initialized with default arguments.
@@ -109,11 +109,11 @@ def get_objective(objective, return_instance=False, **kwargs):
 
 
 def get_core_objectives(problem_type):
-    """Returns all core objective instances associated with the given problem type.
+    """Return all core objective instances associated with the given problem type.
 
     Core objectives are designed to work out-of-the-box for any dataset.
 
-    Parameters
+    Args:
         problem_type (str/ProblemTypes): Type of problem
 
     Returns:
diff --git a/evalml/pipelines/binary_classification_pipeline.py b/evalml/pipelines/binary_classification_pipeline.py
index 55467da06e..cc66e2409b 100644
--- a/evalml/pipelines/binary_classification_pipeline.py
+++ b/evalml/pipelines/binary_classification_pipeline.py
@@ -13,7 +13,7 @@ class BinaryClassificationPipeline(
 ):
     """Pipeline subclass for all binary classification pipelines.
 
-    Parameters
+    Args:
         component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
             Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
             component's index in the list.
For example, the component graph
@@ -31,11 +31,11 @@ class BinaryClassificationPipeline(
     def _predict(self, X, objective=None):
         """Make predictions using selected features.
 
-    Parameters
+    Args:
         X (pd.DataFrame): Data of shape [n_samples, n_features]
         objective (Object or string): The objective to use to make predictions
 
     Returns:
         pd.Series: Estimated labels
     """
 
@@ -55,10 +55,10 @@ def _predict(self, X, objective=None):
     def predict_proba(self, X):
         """Make probability estimates for labels. Assumes that the column at index 1 represents the positive label case.
 
-    Parameters
+    Args:
         X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
 
     Returns:
         pd.Series: Probability estimates
     """
     return super().predict_proba(X)
diff --git a/evalml/pipelines/binary_classification_pipeline_mixin.py b/evalml/pipelines/binary_classification_pipeline_mixin.py
index 10382cd220..6bb3b4a98c 100644
--- a/evalml/pipelines/binary_classification_pipeline_mixin.py
+++ b/evalml/pipelines/binary_classification_pipeline_mixin.py
@@ -42,7 +42,7 @@ def _select_y_pred_for_score(self, X, y, y_pred, y_pred_proba, objective):
     def optimize_threshold(self, X, y, y_pred_proba, objective):
         """Optimize the pipeline threshold given the objective to use. Only used for binary problems with objectives whose thresholds can be tuned.
 
-    Parameters
+    Args:
         X (pd.DataFrame): Input features
         y (pd.Series): Input target values
         y_pred_proba (pd.Series): The predicted probabilities of the target outputted by the pipeline
diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py
index 4e8749745b..f1db69d46c 100644
--- a/evalml/pipelines/classification_pipeline.py
+++ b/evalml/pipelines/classification_pipeline.py
@@ -8,7 +8,7 @@ class ClassificationPipeline(PipelineBase):
     """Pipeline subclass for all classification pipelines.
 
-    Parameters
+    Args:
         component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
             Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
             component's index in the list. For example, the component graph
@@ -38,11 +38,11 @@ def __init__(
     def fit(self, X, y):
         """Build a classification model. For string and categorical targets, classes are sorted by sorted(set(y)) and then are mapped to values between 0 and n_classes-1.
 
-    Parameters
+    Args:
         X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
         y (pd.Series, np.ndarray): The target training labels of length [n_samples]
 
     Returns:
         self
     """
     X = infer_feature_types(X)
@@ -80,11 +80,11 @@ def classes_(self):
     def _predict(self, X, objective=None):
         """Make predictions using selected features.
 
-    Parameters
+    Args:
         X (pd.DataFrame): Data of shape [n_samples, n_features]
         objective (Object or string): The objective to use to make predictions
 
     Returns:
         pd.Series: Estimated labels
     """
     return self.component_graph.predict(X)
 
     def predict(self, X, objective=None):
         """Make predictions using selected features.
-    Parameters
+    Args:
         X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
         objective (Object or string): The objective to use to make predictions
 
     Returns:
         pd.Series: Estimated labels
     """
     predictions = self._predict(X, objective=objective)
@@ -108,10 +108,10 @@ def predict(self, X, objective=None):
     def predict_proba(self, X):
         """Make probability estimates for labels.
 
-    Parameters
+    Args:
         X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
 
     Returns:
         pd.DataFrame: Probability estimates
     """
     if self.estimator is None:
@@ -131,12 +131,12 @@ def predict_proba(self, X):
     def score(self, X, y, objectives):
         """Evaluate model performance on objectives.
 
-    Parameters
+    Args:
         X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
         y (pd.Series, or np.ndarray): True labels of length [n_samples]
         objectives (list): List of objectives to score
 
     Returns:
         dict: Ordered dictionary of objective scores
     """
     y = infer_feature_types(y)
diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py
index ffa73d137e..9983502e45 100644
--- a/evalml/pipelines/component_graph.py
+++ b/evalml/pipelines/component_graph.py
@@ -28,11 +28,11 @@
 class ComponentGraph:
     """Component graph for a pipeline as a directed acyclic graph (DAG).
 
-    Parameters
+    Args:
         component_dict (dict): A dictionary which specifies the components and edges between components that should be used to create the component graph. Defaults to None.
         random_seed (int): Seed for the random number generator. Defaults to 0.
 
-    Examples
+    Example:
         >>> component_dict = {'imputer': ['Imputer'], 'ohe': ['One Hot Encoder', 'imputer.x'], 'estimator_1': ['Random Forest Classifier', 'ohe.x'], 'estimator_2': ['Decision Tree Classifier', 'ohe.x'], 'final': ['Logistic Regression Classifier', 'estimator_1', 'estimator_2']}
         >>> component_graph = ComponentGraph(component_dict)
     """
@@ -126,7 +126,7 @@ def compute_order(self):
     def default_parameters(self):
         """The default parameter dictionary for this pipeline.
 
     Returns:
         dict: Dictionary of all component default parameters.
     """
     defaults = {}
@@ -138,7 +138,7 @@ def default_parameters(self):
     def instantiate(self, parameters):
         """Instantiates all uninstantiated components within the graph using the given parameters. An error will be raised if a component is already instantiated but the parameters dict contains arguments for that component.
 
-    Parameters
+    Args:
         parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
             An empty dictionary {} or None implies using all default values for component parameters. If a component
             in the component graph is already instantiated, it will not use any of its parameters defined in this dictionary.
@@ -176,7 +176,7 @@ def instantiate(self, parameters):
     def fit(self, X, y):
         """Fit each component in the graph.
 
-    Parameters
+    Args:
         X (pd.DataFrame): The input training data of shape [n_samples, n_features].
         y (pd.Series): The target training data of length [n_samples].
     """
@@ -189,11 +189,11 @@ def fit(self, X, y):
     def fit_features(self, X, y):
         """Fit all components save the final one, usually an estimator.
 
-    Parameters
+    Args:
         X (pd.DataFrame): The input training data of shape [n_samples, n_features].
         y (pd.Series): The target training data of length [n_samples].
 
     Returns:
         pd.DataFrame: Transformed values.
""" return self._fit_transform_features_helper(True, X, y) @@ -201,11 +201,11 @@ def fit_features(self, X, y): def compute_final_component_features(self, X, y=None): """Transform all components save the final one, and gathers the data from any number of parents to get all the information that should be fed to the final component. - Parameters + Args: X (pd.DataFrame): Data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. Defaults to None. - Returns: + Returns pd.DataFrame: Transformed values. """ return self._fit_transform_features_helper(False, X, y) @@ -213,12 +213,12 @@ def compute_final_component_features(self, X, y=None): def _fit_transform_features_helper(self, needs_fitting, X, y=None): """Transform all components save the final one, and returns the data that should be fed to the final component, usually an estimator. - Parameters + Args: needs_fitting (boolean): Determines if components should be fit. X (pd.DataFrame): Data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. Defaults to None. - Returns: + Returns pd.DataFrame: Transformed values. """ if len(self.compute_order) <= 1: @@ -260,11 +260,11 @@ def _consolidate_inputs_for_component( def transform(self, X, y=None): """Transform the input using the component graph. - Parameters + Args: X (pd.DataFrame): Input features of shape [n_samples, n_features]. y (pd.Series): The target data of length [n_samples]. Defaults to None. - Returns: + Returns pd.DataFrame: Transformed output. """ if len(self.compute_order) == 0: @@ -286,10 +286,10 @@ def transform(self, X, y=None): def predict(self, X): """Make predictions using selected features. - Parameters + Args: X (pd.DataFrame): Input features of shape [n_samples, n_features]. - Returns: + Returns pd.Series: Predicted values. """ if len(self.compute_order) == 0: @@ -306,14 +306,14 @@ def predict(self, X): def _compute_features(self, component_list, X, y=None, fit=False): """Transforms the data by applying the given components. - Parameters + Args: component_list (list): The list of component names to compute. X (pd.DataFrame): Input data to the pipeline to transform. y (pd.Series): The target training data of length [n_samples]. fit (boolean): Whether to fit the estimators as well as transform it. Defaults to False. - Returns: + Returns dict: Outputs from each component. """ X = infer_feature_types(X) @@ -387,10 +387,10 @@ def _get_feature_provenance(self, input_feature_names): If a feature is then calculated from feature 'a', e.g. 'a_squared', then the provenance would instead be {'cats': ['a', 'a_squared', 'b']}. - Parameters + Args: input_feature_names (list(str)): Names of the features in the input dataframe. - Returns: + Returns dictionary: mapping of feature name to set feature names that were created from that feature. """ if not self.compute_order: @@ -442,10 +442,10 @@ def _get_feature_provenance(self, input_feature_names): def get_component(self, component_name): """Retrieves a single component object from the graph. - Parameters + Args: component_name (str): Name of the component to retrieve - Returns: + Returns ComponentBase object """ try: @@ -456,7 +456,7 @@ def get_component(self, component_name): def get_last_component(self): """Retrieves the component that is computed last in the graph, usually the final estimator. 
     Returns:
         ComponentBase object
     """
     if len(self.compute_order) == 0:
@@ -467,7 +467,7 @@ def get_last_component(self):
     def get_estimators(self):
         """Gets a list of all the estimator components within this graph.
 
     Returns:
         list: All estimator objects within the graph.
     """
     if not isinstance(self.get_last_component(), ComponentBase):
@@ -483,10 +483,10 @@ def get_estimators(self):
     def get_inputs(self, component_name):
         """Retrieves all inputs for a given component.
 
-    Parameters
+    Args:
         component_name (str): Name of the component to look up.
 
     Returns:
         list[str]: List of inputs for the component to use.
     """
     try:
@@ -500,10 +500,10 @@ def get_inputs(self, component_name):
     def describe(self, return_dict=False):
         """Outputs component graph details including component parameters.
 
-    Parameters
+    Args:
         return_dict (bool): If True, return dictionary of information about component graph. Defaults to False.
 
     Returns:
         dict: Dictionary of all component parameters if return_dict is True, else None
     """
     components = {}
@@ -523,11 +523,11 @@ def describe(self, return_dict=False):
     def graph(self, name=None, graph_format=None):
         """Generate an image representing the component graph.
 
-    Parameters
+    Args:
         name (str): Name of the graph. Defaults to None.
         graph_format (str): file format to save the graph in. Defaults to None.
 
     Returns:
         graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks.
     """
     graphviz = import_or_raise(
@@ -621,7 +621,7 @@ def __iter__(self):
     def __next__(self):
         """Iterator for graphs, retrieves the components in the graph in order.
 
     Returns:
         ComponentBase obj: The next component class or instance in the graph
     """
     if self._i < len(self.compute_order):
@@ -682,7 +682,7 @@ def inverse_transform(self, y):
 
     Components that implement inverse_transform are PolynomialDetrender, LabelEncoder (tbd).
 
-    Parameters
+    Args:
         y: (pd.Series): Final component features
     """
     data_to_transform = infer_feature_types(y)
diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py
index bd346428af..a59509b233 100644
--- a/evalml/pipelines/components/component_base.py
+++ b/evalml/pipelines/components/component_base.py
@@ -19,7 +19,7 @@
 class ComponentBase(ABC, metaclass=ComponentBaseMeta):
     """Base class for all components.
 
-    Parameters
+    Args:
         parameters (dict): Dictionary of parameters for the component. Defaults to None.
         component_obj (obj): Third-party objects useful in component implementation. Defaults to None.
         random_seed (int): Seed for the random number generator. Defaults to 0.
@@ -30,7 +30,7 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta):
     def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs):
         """Base class for all components.
 
-    Parameters
+    Args:
         parameters (dict): Dictionary of parameters for the component. Defaults to None.
         component_obj (obj): Third-party objects useful in component implementation. Defaults to None.
         random_seed (int): Seed for the random number generator. Defaults to 0.
@@ -94,7 +94,7 @@ def default_parameters(cls):
 
     Our convention is that Component.default_parameters == Component().parameters.
 
     Returns:
         dict: default parameters for this component.
     """
 
@@ -110,7 +110,7 @@ def _supported_by_list_API(cls):
     def clone(self):
         """Constructs a new component with the same parameters and random state.
 
     Returns:
         A new instance of this component with identical parameters and random state.
""" return self.__class__(**self.parameters, random_seed=self.random_seed) @@ -118,11 +118,11 @@ def clone(self): def fit(self, X, y=None): """Fits component to data. - Parameters + Args: X (list, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (list, pd.Series, np.ndarray, optional): The target training data of length [n_samples] - Returns: + Returns self """ X = infer_feature_types(X) @@ -139,11 +139,11 @@ def fit(self, X, y=None): def describe(self, print_name=False, return_dict=False): """Describe a component and its parameters. - Parameters + Args: print_name(bool, optional): whether to print name of component return_dict(bool, optional): whether to return description as dictionary in the format {"name": name, "parameters": parameters} - Returns: + Returns None or dict: prints and returns dictionary """ if print_name: @@ -162,11 +162,11 @@ def describe(self, print_name=False, return_dict=False): def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): """Saves component at file path. - Parameters + Args: file_path (str): Location to save file pickle_protocol (int): The pickle data stream format. - Returns: + Returns None """ with open(file_path, "wb") as f: @@ -176,10 +176,10 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): def load(file_path): """Loads component at file path. - Parameters + Args: file_path (str): Location to load file - Returns: + Returns ComponentBase object """ with open(file_path, "rb") as f: diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py index e780b7aaf8..06091b82fb 100644 --- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py +++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py @@ -10,7 +10,7 @@ class SklearnStackedEnsembleBase(Estimator): """Stacked Ensemble Base Class. - Parameters + Args: input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators. This must not be None or an empty list or else EnsembleMissingPipelinesError will be raised. final_estimator (Estimator or subclass): The estimator used to combine the base estimators. @@ -103,7 +103,7 @@ def feature_importance(self): def default_parameters(cls): """Returns the default parameters for stacked ensemble classes. - Returns: + Returns dict: default parameters for this component. """ return { diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py index 82a6c02d03..9ce2d90647 100644 --- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py +++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py @@ -10,7 +10,7 @@ class SklearnStackedEnsembleClassifier(SklearnStackedEnsembleBase): """Scikit-learn Stacked Ensemble Classifier. - Parameters + Args: input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators. This must not be None or an empty list or else EnsembleMissingPipelinesError will be raised. final_estimator (Estimator or subclass): The classifier used to combine the base estimators. If None, uses LogisticRegressionClassifier. 
diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py
index 772a486daf..37f8078b9e 100644
--- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py
+++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py
@@ -10,7 +10,7 @@
class SklearnStackedEnsembleRegressor(SklearnStackedEnsembleBase):
"""Scikit-learn Stacked Ensemble Regressor.
- Parameters
+ Args:
input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators. This must not be None or an empty list or else EnsembleMissingPipelinesError will be raised.
final_estimator (Estimator or subclass): The regressor used to combine the base estimators. If None, uses LinearRegressor.
diff --git a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py
index 732f1037f8..1aed5552b5 100644
--- a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py
@@ -12,7 +12,7 @@ class BaselineClassifier(Estimator):
This is useful as a simple baseline classifier to compare with other classifiers.
- Parameters
+ Args:
strategy (str): Method used to predict. Valid options are "mode", "random" and "random_weighted". Defaults to "mode".
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
@@ -96,7 +96,7 @@ def predict_proba(self, X):
def feature_importance(self):
"""Returns importance associated with each feature. Since baseline classifiers do not use input features to calculate predictions, returns an array of zeroes.
- Returns
+ Returns:
np.ndarray (float): An array of zeroes
"""
return np.zeros(self._num_features)
@@ -105,7 +105,7 @@ def classes_(self):
"""Returns class labels. Will return None before fitting.
- Returns
+ Returns:
list[str] or list(float) : Class names
"""
return self._classes
diff --git a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py
index 6c30212e10..f6dc0e0ae9 100644
--- a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py
@@ -17,7 +17,7 @@ class CatBoostClassifier(Estimator):
For more information, check out https://catboost.ai/
- Parameters
+ Args:
n_estimators (float): The maximum number of trees to build. Defaults to 10.
eta (float): The learning rate. Defaults to 0.03.
max_depth (int): The maximum tree depth for base learners. Defaults to 6.
diff --git a/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py b/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py
index 9b6821561b..38994b77d1 100644
--- a/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py
@@ -9,7 +9,7 @@
class DecisionTreeClassifier(Estimator):
"""Decision Tree Classifier.
- Parameters
+ Args:
criterion ({"gini", "entropy"}): The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. Defaults to "gini".
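A minimal usage sketch for the criterion argument documented above; the toy data is an illustrative assumption, not taken from this patch.

    import pandas as pd
    from evalml.pipelines.components import DecisionTreeClassifier

    X = pd.DataFrame({"a": [0, 1, 0, 1], "b": [1, 1, 0, 0]})
    y = pd.Series([0, 1, 0, 1])

    clf = DecisionTreeClassifier(criterion="entropy")  # "gini" is the documented default
    clf.fit(X, y)
    predictions = clf.predict(X)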
diff --git a/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py b/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py
index 77ef28c904..99abb89f47 100644
--- a/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py
@@ -12,7 +12,7 @@
class ElasticNetClassifier(Estimator):
"""Elastic Net Classifier. Uses Logistic Regression with elasticnet penalty as the base estimator.
- Parameters
+ Args:
penalty ({"l1", "l2", "elasticnet", "none"}): The norm used in penalization. Defaults to "elasticnet".
C (float): Inverse of regularization strength. Must be a positive float. Defaults to 1.0.
l1_ratio (float): The mixing parameter, with 0 <= l1_ratio <= 1. Only used if penalty='elasticnet'. Setting l1_ratio=0 is equivalent to using penalty='l2',
diff --git a/evalml/pipelines/components/estimators/classifiers/et_classifier.py b/evalml/pipelines/components/estimators/classifiers/et_classifier.py
index abed596522..c623051754 100644
--- a/evalml/pipelines/components/estimators/classifiers/et_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/et_classifier.py
@@ -9,7 +9,7 @@
class ExtraTreesClassifier(Estimator):
"""Extra Trees Classifier.
- Parameters
+ Args:
n_estimators (float): The number of trees in the forest. Defaults to 100.
max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
diff --git a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py
index dec76cd267..89e9321488 100644
--- a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py
@@ -10,7 +10,7 @@
class KNeighborsClassifier(Estimator):
"""K-Nearest Neighbors Classifier.
- Parameters
+ Args:
n_neighbors (int): Number of neighbors to use by default. Defaults to 5.
weights ({‘uniform’, ‘distance’} or callable): Weight function used in prediction. Can be:
diff --git a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py
index a000ae3c7c..6328e0e9bb 100644
--- a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py
@@ -20,7 +20,7 @@
class LightGBMClassifier(Estimator):
"""LightGBM Classifier.
- Parameters
+ Args:
boosting_type (string): Type of boosting to use. Defaults to "gbdt".
- 'gbdt' uses traditional Gradient Boosting Decision Tree
- "dart", uses Dropouts meet Multiple Additive Regression Trees
diff --git a/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py b/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py
index 5bbbd603f8..6bc45859a0 100644
--- a/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py
@@ -10,7 +10,7 @@
class LogisticRegressionClassifier(Estimator):
"""Logistic Regression Classifier.
- Parameters
+ Args:
penalty ({"l1", "l2", "elasticnet", "none"}): The norm used in penalization. Defaults to "l2".
C (float): Inverse of regularization strength. Must be a positive float. Defaults to 1.0.
multi_class ({"auto", "ovr", "multinomial"}): If the option chosen is "ovr", then a binary problem is fit for each label. diff --git a/evalml/pipelines/components/estimators/classifiers/rf_classifier.py b/evalml/pipelines/components/estimators/classifiers/rf_classifier.py index c1667a8c2f..23ae50d2db 100644 --- a/evalml/pipelines/components/estimators/classifiers/rf_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/rf_classifier.py @@ -9,7 +9,7 @@ class RandomForestClassifier(Estimator): """Random Forest Classifier. - Parameters + Args: n_estimators (float): The number of trees in the forest. Defaults to 100. max_depth (int): Maximum tree depth for base learners. Defaults to 6. n_jobs (int or None): Number of jobs to run in parallel. -1 uses all processes. Defaults to -1. diff --git a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py index 797fbb83a6..c7fcfb0830 100644 --- a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py @@ -10,7 +10,7 @@ class SVMClassifier(Estimator): """Support Vector Machine Classifier. - Parameters + Args: C (float): The regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. Defaults to 1.0. kernel ({"poly", "rbf", "sigmoid"}): Specifies the kernel type to be used in the algorithm. Defaults to "rbf". diff --git a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py index e30550260f..20b20511cf 100644 --- a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py @@ -12,7 +12,7 @@ class XGBoostClassifier(Estimator): """XGBoost Classifier. - Parameters + Args: eta (float): Boosting learning rate. Defaults to 0.1. max_depth (int): Maximum tree depth for base learners. Defaults to 6. min_child_weight (float): Minimum sum of instance weight (hessian) needed in a child. Defaults to 1.0 diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py index 692eb59f36..fb98f5c4dd 100644 --- a/evalml/pipelines/components/estimators/estimator.py +++ b/evalml/pipelines/components/estimators/estimator.py @@ -19,7 +19,7 @@ class Estimator(ComponentBase): To see some examples, check out the definitions of any Estimator component. - Parameters + Args: parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -65,10 +65,10 @@ def fit(self, X, y=None): def predict(self, X): """Make predictions using selected features. - Parameters + Args: X (pd.DataFrame, np.ndarray): Data of shape [n_samples, n_features] - Returns: + Returns pd.Series: Predicted values """ try: @@ -85,10 +85,10 @@ def predict(self, X): def predict_proba(self, X): """Make probability estimates for labels. - Parameters + Args: X (pd.DataFrame, or np.ndarray): Features - Returns: + Returns pd.Series: Probability estimates """ try: @@ -104,7 +104,7 @@ def predict_proba(self, X): def feature_importance(self): """Returns importance associated with each feature. 
- Returns
+ Returns:
np.ndarray: Importance associated with each feature
"""
try:
diff --git a/evalml/pipelines/components/estimators/regressors/arima_regressor.py b/evalml/pipelines/components/estimators/regressors/arima_regressor.py
index 48d4619539..750033084d 100644
--- a/evalml/pipelines/components/estimators/regressors/arima_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/arima_regressor.py
@@ -13,7 +13,7 @@ class ARIMARegressor(Estimator):
Currently ARIMARegressor isn't supported via conda install. It's recommended that it be installed via PyPI.
- Parameters
+ Args:
date_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None.
trend (str): Controls the deterministic trend. Options are ['n', 'c', 't', 'ct'] where 'c' is a constant term, 't' indicates a linear trend, and 'ct' is both. Can also be an iterable when defining a polynomial, such
diff --git a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py
index 400d6d2841..0d9bb03889 100644
--- a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py
@@ -10,7 +10,7 @@
class BaselineRegressor(Estimator):
"""Baseline regressor that uses a simple strategy to make predictions. This is useful as a simple baseline regressor to compare with other regressors.
- Parameters
+ Args:
strategy (str): Method used to predict. Valid options are "mean", "median". Defaults to "mean".
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
@@ -65,7 +65,7 @@ def predict(self, X):
def feature_importance(self):
"""Returns importance associated with each feature. Since baseline regressors do not use input features to calculate predictions, returns an array of zeroes.
- Returns
+ Returns:
np.ndarray (float): An array of zeroes
"""
return np.zeros(self._num_features)
diff --git a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py
index 79a76a5cf1..d795e7f115 100644
--- a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py
@@ -14,7 +14,7 @@ class CatBoostRegressor(Estimator):
For more information, check out https://catboost.ai/
- Parameters
+ Args:
n_estimators (float): The maximum number of trees to build. Defaults to 10.
eta (float): The learning rate. Defaults to 0.03.
max_depth (int): The maximum tree depth for base learners. Defaults to 6.
diff --git a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py
index 9b4d95e47d..4562bdb2f5 100644
--- a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py
@@ -9,7 +9,7 @@
class DecisionTreeRegressor(Estimator):
"""Decision Tree Regressor.
- Parameters
+ Args:
criterion ({"mse", "friedman_mse", "mae", "poisson"}): The function to measure the quality of a split.
Supported criteria are:
diff --git a/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py b/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py
index f54c772cbf..b20a86e05d 100644
--- a/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py
@@ -9,7 +9,7 @@
class ElasticNetRegressor(Estimator):
"""Elastic Net Regressor.
- Parameters
+ Args:
alpha (float): Constant that multiplies the penalty terms. Defaults to 0.0001.
l1_ratio (float): The mixing parameter, with 0 <= l1_ratio <= 1. Only used if penalty='elasticnet'. Setting l1_ratio=0 is equivalent to using penalty='l2', while setting l1_ratio=1 is equivalent to using penalty='l1'. For 0 < l1_ratio <1, the penalty is a combination of L1 and L2. Defaults to 0.15.
diff --git a/evalml/pipelines/components/estimators/regressors/et_regressor.py b/evalml/pipelines/components/estimators/regressors/et_regressor.py
index fa28354aec..fd8dac3609 100644
--- a/evalml/pipelines/components/estimators/regressors/et_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/et_regressor.py
@@ -9,7 +9,7 @@
class ExtraTreesRegressor(Estimator):
"""Extra Trees Regressor.
- Parameters
+ Args:
n_estimators (float): The number of trees in the forest. Defaults to 100.
max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
diff --git a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py
index 37bfe0163d..c230e0d633 100644
--- a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py
@@ -18,7 +18,7 @@
class LightGBMRegressor(Estimator):
"""LightGBM Regressor.
- Parameters
+ Args:
boosting_type (string): Type of boosting to use. Defaults to "gbdt".
- 'gbdt' uses traditional Gradient Boosting Decision Tree
- "dart", uses Dropouts meet Multiple Additive Regression Trees
diff --git a/evalml/pipelines/components/estimators/regressors/linear_regressor.py b/evalml/pipelines/components/estimators/regressors/linear_regressor.py
index f8f9c5fe82..2d2fa05ec4 100644
--- a/evalml/pipelines/components/estimators/regressors/linear_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/linear_regressor.py
@@ -8,7 +8,7 @@
class LinearRegressor(Estimator):
"""Linear Regressor.
- Parameters
+ Args:
fit_intercept (boolean): Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). Defaults to True.
diff --git a/evalml/pipelines/components/estimators/regressors/prophet_regressor.py b/evalml/pipelines/components/estimators/regressors/prophet_regressor.py
index b013f6c884..de82806392 100644
--- a/evalml/pipelines/components/estimators/regressors/prophet_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/prophet_regressor.py
@@ -147,7 +147,7 @@ def default_parameters(cls):
Our convention is that Component.default_parameters == Component().parameters.
- Returns
+ Returns:
dict: default parameters for this component.
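The convention named in this docstring (Component.default_parameters == Component().parameters) can be sanity-checked with any component. The sketch below uses RandomForestRegressor to avoid the optional Prophet dependency; it is illustrative only and not part of the diff.

    from evalml.pipelines.components import RandomForestRegressor

    # default_parameters is a class-level property; parameters reflects a concrete instance.
    assert RandomForestRegressor.default_parameters == RandomForestRegressor().parameters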
""" diff --git a/evalml/pipelines/components/estimators/regressors/rf_regressor.py b/evalml/pipelines/components/estimators/regressors/rf_regressor.py index 14e664d544..867c761115 100644 --- a/evalml/pipelines/components/estimators/regressors/rf_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/rf_regressor.py @@ -9,7 +9,7 @@ class RandomForestRegressor(Estimator): """Random Forest Regressor. - Parameters + Args: n_estimators (float): The number of trees in the forest. Defaults to 100. max_depth (int): Maximum tree depth for base learners. Defaults to 6. n_jobs (int or None): Number of jobs to run in parallel. -1 uses all processes. Defaults to -1. diff --git a/evalml/pipelines/components/estimators/regressors/svm_regressor.py b/evalml/pipelines/components/estimators/regressors/svm_regressor.py index 49cb51ca03..aaca1f1487 100644 --- a/evalml/pipelines/components/estimators/regressors/svm_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/svm_regressor.py @@ -10,7 +10,7 @@ class SVMRegressor(Estimator): """Support Vector Machine Regressor. - Parameters + Args: C (float): The regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. Defaults to 1.0. kernel ({"poly", "rbf", "sigmoid"}): Specifies the kernel type to be used in the algorithm. Defaults to "rbf". diff --git a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py index b9903fc729..07fd120d3f 100644 --- a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py +++ b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py @@ -12,7 +12,7 @@ class TimeSeriesBaselineEstimator(Estimator): This is useful as a simple baseline estimator for time series problems. - Parameters + Args: gap (int): Gap between prediction date and target date and must be a positive integer. If gap is 0, target date will be shifted ahead by 1 time period. Defaults to 1. random_seed (int): Seed for the random number generator. Defaults to 0. """ @@ -87,7 +87,7 @@ def feature_importance(self): Since baseline estimators do not use input features to calculate predictions, returns an array of zeroes. - Returns: + Returns np.ndarray (float): an array of zeroes """ return np.zeros(self._num_features) diff --git a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py index 51c70a20f6..2fca4af0a9 100644 --- a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py @@ -12,7 +12,7 @@ class XGBoostRegressor(Estimator): """XGBoost Regressor. - Parameters + Args: eta (float): Boosting learning rate. Defaults to 0.1. max_depth (int): Maximum tree depth for base learners. Defaults to 6. min_child_weight (float): Minimum sum of instance weight (hessian) needed in a child. 
Defaults to 1.0
diff --git a/evalml/pipelines/components/transformers/column_selectors.py b/evalml/pipelines/components/transformers/column_selectors.py
index d457d71dff..9f85dc1b76 100644
--- a/evalml/pipelines/components/transformers/column_selectors.py
+++ b/evalml/pipelines/components/transformers/column_selectors.py
@@ -7,7 +7,7 @@
class ColumnSelector(Transformer):
"""Initalizes an transformer that drops specified columns in input data.
- Parameters
+ Args:
columns (list(string)): List of column names, used to determine which columns to select.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
@@ -39,11 +39,11 @@ def _modify_columns(self, cols, X, y=None):
def fit(self, X, y=None):
"""Fits the transformer by checking if column names are present in the dataset.
- Parameters
+ Args:
X (pd.DataFrame): Data to check.
y (pd.Series, optional): Targets.
- Returns
+ Returns:
self
"""
X = infer_feature_types(X)
@@ -61,7 +61,7 @@ def transform(self, X, y=None):
class DropColumns(ColumnSelector):
"""Drops specified columns in input data.
- Parameters
+ Args:
columns (list(string)): List of column names, used to determine which columns to drop.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
@@ -77,11 +77,11 @@ def _modify_columns(self, cols, X, y=None):
def transform(self, X, y=None):
"""Transforms data X by dropping columns.
- Parameters
+ Args:
X (pd.DataFrame): Data to transform.
y (pd.Series, optional): Targets.
- Returns
+ Returns:
pd.DataFrame: Transformed X.
"""
return super().transform(X, y)
@@ -90,7 +90,7 @@ def transform(self, X, y=None):
class SelectColumns(ColumnSelector):
"""Selects specified columns in input data.
- Parameters
+ Args:
columns (list(string)): List of column names, used to determine which columns to select.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
@@ -106,11 +106,11 @@ def _modify_columns(self, cols, X, y=None):
def transform(self, X, y=None):
"""Transforms data X by selecting columns.
- Parameters
+ Args:
X (pd.DataFrame): Data to transform.
y (pd.Series, optional): Targets.
- Returns
+ Returns:
pd.DataFrame: Transformed X.
"""
return super().transform(X, y)
@@ -119,7 +119,7 @@ def transform(self, X, y=None):
class SelectByType(ColumnSelector):
"""Selects columns by specified Woodwork logical type or semantic tag in input data.
- Parameters
+ Args:
column_types (string, ww.LogicalType, list(string), list(ww.LogicalType)): List of Woodwork types or tags, used to determine which columns to select.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
index 2af4a8cf46..40026f48bb 100644
--- a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
+++ b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
@@ -12,7 +12,7 @@
class LinearDiscriminantAnalysis(Transformer):
"""Reduces the number of features by using Linear Discriminant Analysis.
- Parameters
+ Args:
n_components (int): The number of features to maintain after computation. Defaults to None.
random_seed (int): Seed for the random number generator. Defaults to 0.
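A short, hedged sketch of the transformer above; the toy data, import path, and n_components choice are assumptions for illustration.

    import pandas as pd
    from evalml.pipelines.components.transformers.dimensionality_reduction import (
        LinearDiscriminantAnalysis,
    )

    X = pd.DataFrame({"f1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], "f2": [1.5, 0.5, 2.5, 3.0, 4.5, 4.0]})
    y = pd.Series([0, 0, 0, 1, 1, 1])

    # LDA is supervised, so fitting requires y; n_components must be < number of classes.
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_reduced = lda.fit_transform(X, y)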
""" diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py index 96cf0b6a10..4a7ed43662 100644 --- a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py +++ b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py @@ -13,7 +13,7 @@ class PCA(Transformer): """Reduces the number of features by using Principal Component Analysis (PCA). - Parameters + Args: variance (float): The percentage of the original data variance that should be preserved when reducing the number of features. Defaults to 0.95. n_components (int): The number of features to maintain after computing SVD. Defaults to None, but will override diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index 1dc3aeaee6..76d37bc586 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -20,7 +20,7 @@ class OneHotEncoderMeta(ComponentBaseMeta): class OneHotEncoder(Transformer, metaclass=OneHotEncoderMeta): """A transformer that encodes categorical features in a one-hot numeric array. - Parameters + Args: top_n (int): Number of categories per column to encode. If None, all categories will be encoded. Otherwise, the `n` most frequent will be encoded and all others will be dropped. Defaults to 10. features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched. @@ -158,11 +158,11 @@ def fit(self, X, y=None): def transform(self, X, y=None): """One-hot encode the input data. - Parameters + Args: X (pd.DataFrame): Features to one-hot encode. y (pd.Series): Ignored. - Returns: + Returns pd.DataFrame: Transformed data, where each categorical feature has been encoded into numerical columns using one-hot encoding. """ X = infer_feature_types(X) @@ -201,9 +201,9 @@ def _handle_parameter_handle_missing(self, X): def categories(self, feature_name): """Returns a list of the unique categories to be encoded for the particular feature, in order. - Parameters + Args: feature_name (str): the name of any feature provided to one-hot encoder during fit - Returns: + Returns np.ndarray: the unique categories, in the same dtype as they were provided during fit """ try: @@ -238,7 +238,7 @@ def _get_feature_names(self): For example, consider a dataframe with a column called "A" and category "x_y" and another column called "A_x" with "y". In this example, the feature names would be "A_x_y" and "A_x_y_1". - Returns: + Returns np.ndarray: The feature names after encoding, provided in the same order as input_features. """ self._features_to_drop = [] @@ -285,7 +285,7 @@ def get_feature_names(self): For example, consider a dataframe with a column called "A" and category "x_y" and another column called "A_x" with "y". In this example, the feature names would be "A_x_y" and "A_x_y_1". - Returns: + Returns np.ndarray: The feature names after encoding, provided in the same order as input_features. 
""" feature_names = self._get_feature_names() diff --git a/evalml/pipelines/components/transformers/encoders/target_encoder.py b/evalml/pipelines/components/transformers/encoders/target_encoder.py index 520823584e..d16e4d437c 100644 --- a/evalml/pipelines/components/transformers/encoders/target_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/target_encoder.py @@ -16,7 +16,7 @@ class TargetEncoder(Transformer, metaclass=OneHotEncoderMeta): """A transformer that encodes categorical features into target encodings. - Parameters + Args: cols (list): Columns to encode. If None, all string columns will be encoded, otherwise only the columns provided will be encoded. Defaults to None smoothing (float): The smoothing factor to apply. The larger this value is, the more influence the expected target value has @@ -94,7 +94,7 @@ def fit_transform(self, X, y): def get_feature_names(self): """Return feature names for the input features after fitting. - Returns: + Returns np.array: The feature names after encoding """ return self._component_obj.get_feature_names() diff --git a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py index 76971a509a..49557214f9 100644 --- a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py @@ -11,7 +11,7 @@ class FeatureSelector(Transformer): """Selects top features based on importance weights. - Parameters + Args: parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -20,7 +20,7 @@ class FeatureSelector(Transformer): def get_names(self): """Get names of selected features. - Returns: + Returns list[str]: List of the names of features selected """ selected_masks = self._component_obj.get_support() @@ -35,11 +35,11 @@ def get_names(self): def transform(self, X, y=None): """Transforms input data by selecting features. If the component_obj does not have a transform method, will raise an MethodPropertyNotFoundError exception. - Parameters + Args: X (pd.DataFrame): Data to transform. y (pd.Series, optional): Target data. Ignored. - Returns: + Returns pd.DataFrame: Transformed X """ X_ww = infer_feature_types(X) diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py index 245c19c418..b8917c470e 100644 --- a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py @@ -9,7 +9,7 @@ class RFClassifierSelectFromModel(FeatureSelector): """Selects top features based on importance weights using a Random Forest classifier. - Parameters + Args: number_features (int): The maximum number of features to select. If both percent_features and number_features are specified, take the greater number of features. Defaults to 0.5. Defaults to None. 
diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py
index 7dff11068a..9462447c91 100644
--- a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py
+++ b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py
@@ -9,7 +9,7 @@
class RFRegressorSelectFromModel(FeatureSelector):
"""Selects top features based on importance weights using a Random Forest regressor.
- Parameters
+ Args:
number_features (int): The maximum number of features to select. If both percent_features and number_features are specified, take the greater number of features. Defaults to 0.5. Defaults to None.
diff --git a/evalml/pipelines/components/transformers/imputers/imputer.py b/evalml/pipelines/components/transformers/imputers/imputer.py
index 28f57f1ed3..4a74ba46cf 100644
--- a/evalml/pipelines/components/transformers/imputers/imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/imputer.py
@@ -11,7 +11,7 @@
class Imputer(Transformer):
"""Imputes missing data according to a specified imputation strategy.
- Parameters
+ Args:
categorical_impute_strategy (string): Impute strategy to use for string, object, boolean, categorical dtypes. Valid values include "most_frequent" and "constant".
numeric_impute_strategy (string): Impute strategy to use for numeric columns. Valid values include "mean", "median", "most_frequent", and "constant".
categorical_fill_value (string): When categorical_impute_strategy == "constant", fill_value is used to replace missing data. The default value of None will fill with the string "missing_value".
@@ -78,11 +78,11 @@ def __init__(
def fit(self, X, y=None):
"""Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same.
- Parameters
+ Args:
X (pd.DataFrame, np.ndarray): The input training data of shape [n_samples, n_features]
y (pd.Series, optional): The target training data of length [n_samples]
- Returns
+ Returns:
self
"""
X = infer_feature_types(X)
@@ -108,11 +108,11 @@ def fit(self, X, y=None):
def transform(self, X, y=None):
"""Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are treated as the same.
- Parameters
+ Args:
X (pd.DataFrame): Data to transform
y (pd.Series, optional): Ignored.
- Returns
+ Returns:
pd.DataFrame: Transformed X
"""
X = infer_feature_types(X)
diff --git a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py
index 8effc33331..7beff86db1 100644
--- a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py
@@ -11,7 +11,7 @@
class PerColumnImputer(Transformer):
"""Imputes missing data according to a specified imputation strategy per column.
- Parameters
+ Args:
impute_strategies (dict): Column and {"impute_strategy": strategy, "fill_value":value} pairings. Valid values for impute strategy include "mean", "median", "most_frequent", "constant" for numerical data, and "most_frequent", "constant" for object data types. Defaults to None, which uses "most_frequent" for all columns.
@@ -54,11 +54,11 @@ def __init__(
def fit(self, X, y=None):
"""Fits imputers on input data.
- Parameters
+ Args:
X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit.
y (pd.Series, optional): The target training data of length [n_samples]. Ignored.
- Returns
+ Returns:
self
"""
X = infer_feature_types(X)
@@ -81,11 +81,11 @@ def fit(self, X, y=None):
def transform(self, X, y=None):
"""Transforms input data by imputing missing values.
- Parameters
+ Args:
X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to transform.
y (pd.Series, optional): The target training data of length [n_samples]. Ignored.
- Returns
+ Returns:
pd.DataFrame: Transformed X
"""
X_ww = infer_feature_types(X)
diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
index 57777feacd..989195ec81 100644
--- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
@@ -12,7 +12,7 @@
class SimpleImputer(Transformer):
"""Imputes missing data according to a specified imputation strategy.
- Parameters
+ Args:
impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for numerical data, and "most_frequent", "constant" for object data types.
fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
@@ -40,11 +40,11 @@ def __init__(
def fit(self, X, y=None):
"""Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same.
- Parameters
+ Args:
X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
y (pd.Series, optional): the target training data of length [n_samples]
- Returns
+ Returns:
self
"""
X = infer_feature_types(X)
@@ -71,11 +71,11 @@ def fit(self, X, y=None):
def transform(self, X, y=None):
"""Transforms input by imputing missing values. 'None' and np.nan values are treated as the same.
- Parameters
+ Args:
X (pd.DataFrame): Data to transform
y (pd.Series, optional): Ignored.
- Returns
+ Returns:
pd.DataFrame: Transformed X
"""
X = infer_feature_types(X)
@@ -107,11 +107,11 @@ def transform(self, X, y=None):
def fit_transform(self, X, y=None):
"""Fits on X and transforms X.
- Parameters
+ Args:
X (pd.DataFrame): Data to fit and transform
y (pd.Series, optional): Target data.
- Returns
+ Returns:
pd.DataFrame: Transformed X
"""
return self.fit(X, y).transform(X, y)
diff --git a/evalml/pipelines/components/transformers/imputers/target_imputer.py b/evalml/pipelines/components/transformers/imputers/target_imputer.py
index 05899b6b32..b64ad3f065 100644
--- a/evalml/pipelines/components/transformers/imputers/target_imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/target_imputer.py
@@ -39,7 +39,7 @@ def _check_for_fit(self, X=None, y=None):
class TargetImputer(Transformer, metaclass=TargetImputerMeta):
"""Imputes missing target data according to a specified imputation strategy.
- Parameters
+ Args:
impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for numerical data, and "most_frequent", "constant" for object data types. Defaults to "most_frequent".
fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
@@ -68,11 +68,11 @@ def __init__(
def fit(self, X, y):
"""Fits imputer to target data.
'None' values are converted to np.nan before imputation and are treated as the same.
- Parameters
+ Args:
X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored.
y (pd.Series, optional): The target training data of length [n_samples].
- Returns
+ Returns:
self
"""
if y is None:
@@ -92,11 +92,11 @@ def fit(self, X, y):
def transform(self, X, y):
"""Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.
- Parameters
+ Args:
X (pd.DataFrame): Features. Ignored.
y (pd.Series): Target data to impute.
- Returns
+ Returns:
(pd.DataFrame, pd.Series): The original X, transformed y
"""
@@ -120,11 +120,11 @@ def transform(self, X, y):
def fit_transform(self, X, y):
"""Fits on and transforms the input target data.
- Parameters
+ Args:
X (pd.DataFrame): Features. Ignored.
y (pd.Series): Target data to impute.
- Returns
+ Returns:
(pd.DataFrame, pd.Series): The original X, transformed y
"""
return self.fit(X, y).transform(X, y)
diff --git a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py
index e7c43201c5..82aaffc062 100644
--- a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py
@@ -58,7 +58,7 @@ def _extract_hour(col, encode_as_categories=False):
class DateTimeFeaturizer(Transformer):
"""Transformer that can automatically extract features from datetime columns.
- Parameters
+ Args:
features_to_extract (list): List of features to extract. Valid options include "year", "month", "day_of_week", "hour". Defaults to None.
encode_as_categories (bool): Whether day-of-week and month features should be encoded as pandas "category" dtype. This allows OneHotEncoders to encode these features. Defaults to False.
@@ -119,11 +119,11 @@ def fit(self, X, y=None):
def transform(self, X, y=None):
"""Transforms data X by creating new features using existing DateTime columns, and then dropping those DateTime columns.
- Parameters
+ Args:
X (pd.DataFrame): Data to transform
y (pd.Series, optional): Ignored.
- Returns
+ Returns:
pd.DataFrame: Transformed X
"""
X = infer_feature_types(X)
@@ -146,7 +146,7 @@ def transform(self, X, y=None):
def get_feature_names(self):
"""Gets the categories of each datetime feature.
- Returns
+ Returns:
Dictionary, where each key-value pair is a column name and a dictionary mapping the unique feature values to their integer encoding.
"""
diff --git a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py
index 82421fea68..a255b58c60 100644
--- a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py
@@ -9,7 +9,7 @@
class DelayedFeatureTransformer(Transformer):
"""Transformer that delays input features and target variable for time series problems.
- Parameters
+ Args:
date_index (str): Name of the column containing the datetime information used to order the data. Ignored.
max_delay (int): Maximum number of time units to delay each feature. Defaults to 2.
delay_features (bool): Whether to delay the input features. Defaults to True.
@@ -57,11 +57,11 @@ def __init__(
def fit(self, X, y=None):
"""Fits the DelayFeatureTransformer.
- Parameters
+ Args:
X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
y (pd.Series, optional): The target training data of length [n_samples]
- Returns
+ Returns:
self
"""
return self
@@ -98,11 +98,11 @@ def transform(self, X, y=None):
If y is not None, it will also compute the delayed values for the target variable.
- Parameters
+ Args:
X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
y (pd.Series, or None): Target.
- Returns
+ Returns:
pd.DataFrame: Transformed X.
"""
if X is None:
diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py
index cf7c778783..252cedde3d 100644
--- a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py
+++ b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py
@@ -5,7 +5,7 @@
class DropNullColumns(Transformer):
"""Transformer to drop features whose percentage of NaN values exceeds a specified threshold.
- Parameters
+ Args:
pct_null_threshold(float): The percentage of NaN values in an input feature to drop. Must be a value between [0, 1] inclusive. If equal to 0.0, will drop columns with any null values. If equal to 1.0, will drop columns with all null values. Defaults to 0.95.
@@ -43,11 +43,11 @@ def fit(self, X, y=None):
def transform(self, X, y=None):
"""Transforms data X by dropping columns that exceed the threshold of null values.
- Parameters
+ Args:
X (pd.DataFrame): Data to transform
y (pd.Series, optional): Ignored.
- Returns
+ Returns:
pd.DataFrame: Transformed X
"""
X_t = infer_feature_types(X)
diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py b/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py
index b93eeb43a2..7f98f4542e 100644
--- a/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py
@@ -5,7 +5,7 @@
class DropRowsTransformer(Transformer):
"""Transformer to drop rows specified by row indices.
- Parameters
+ Args:
indices_to_drop (list): List of indices to drop in the input data. Defaults to None.
random_seed (int): Seed for the random number generator. Is not used by this component. Defaults to 0.
"""
diff --git a/evalml/pipelines/components/transformers/preprocessing/featuretools.py b/evalml/pipelines/components/transformers/preprocessing/featuretools.py
index e875af12ab..6eb0d7ddb0 100644
--- a/evalml/pipelines/components/transformers/preprocessing/featuretools.py
+++ b/evalml/pipelines/components/transformers/preprocessing/featuretools.py
@@ -10,7 +10,7 @@
class DFSTransformer(Transformer):
"""Featuretools DFS component that generates features for the input features.
- Parameters
+ Args:
index (string): The name of the column that contains the indices. If no column with this name exists, then featuretools.EntitySet() creates a column with this name to serve as the index column. Defaults to 'index'.
random_seed (int): Seed for the random number generator. Defaults to 0.
@@ -46,11 +46,11 @@ def _make_entity_set(self, X):
def fit(self, X, y=None):
"""Fits the DFSTransformer Transformer component.
- Parameters
+ Args:
X (pd.DataFrame, np.array): The input data to transform, of shape [n_samples, n_features]
y (pd.Series, np.ndarray, optional): The target training data of length [n_samples]
- Returns
+ Returns:
self
"""
X_ww = infer_feature_types(X)
@@ -64,11 +64,11 @@ def fit(self, X, y=None):
def transform(self, X, y=None):
"""Computes the feature matrix for the input X using featuretools' dfs algorithm.
- Parameters
+ Args:
X (pd.DataFrame or np.ndarray): The input training data to transform. Has shape [n_samples, n_features]
y (pd.Series, optional): Ignored.
- Returns
+ Returns:
pd.DataFrame: Feature matrix
"""
X_ww = infer_feature_types(X)
diff --git a/evalml/pipelines/components/transformers/preprocessing/log_transformer.py b/evalml/pipelines/components/transformers/preprocessing/log_transformer.py
index d7e6bc9026..35523ab29e 100644
--- a/evalml/pipelines/components/transformers/preprocessing/log_transformer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/log_transformer.py
@@ -20,7 +20,7 @@ def __init__(self, random_seed=0):
def fit(self, X, y=None):
"""Fits the LogTransformer.
- Parameters
+ Args:
X (pd.DataFrame or np.ndarray): Ignored.
y (pd.Series, optional): Ignored.
@@ -32,7 +32,7 @@ def fit(self, X, y=None):
def transform(self, X, y=None):
"""Log transforms the target variable.
- Parameters
+ Args:
X (pd.DataFrame, optional): Ignored.
y (pd.Series): Target data to log transform.
@@ -52,7 +52,7 @@ def transform(self, X, y=None):
def fit_transform(self, X, y=None):
"""Log transforms the target variable.
- Parameters
+ Args:
X (pd.DataFrame, optional): Ignored.
y (pd.Series): Target variable to log transform.
diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py
index b0a37f16bd..18bd97bc52 100644
--- a/evalml/pipelines/components/transformers/preprocessing/lsa.py
+++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py
@@ -12,7 +12,7 @@
class LSA(TextTransformer):
"""Transformer to calculate the Latent Semantic Analysis Values of text input.
- Parameters
+ Args:
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
@@ -42,7 +42,7 @@ def fit(self, X, y=None):
def transform(self, X, y=None):
"""Transforms data X by applying the LSA pipeline.
- Parameters
+ Args:
X (pd.DataFrame): The data to transform.
y (pd.Series, optional): Ignored.
diff --git a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py
index b811a675ab..91dae0769f 100644
--- a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py
+++ b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py
@@ -10,7 +10,7 @@
class PolynomialDetrender(TargetTransformer):
"""Removes trends from time series by fitting a polynomial to the data.
- Parameters
+ Args:
degree (int): Degree for the polynomial. If 1, linear model is fit to the data. If 2, quadratic model is fit, etc. Defaults to 1.
random_seed (int): Seed for the random number generator. Defaults to 0.
@@ -49,7 +49,7 @@ def __init__(self, degree=1, random_seed=0, **kwargs):
def fit(self, X, y=None):
"""Fits the PolynomialDetrender.
- Parameters
+ Args:
X (pd.DataFrame, optional): Ignored.
y (pd.Series): Target variable to detrend.
@@ -65,7 +65,7 @@ def fit(self, X, y=None):
def transform(self, X, y=None):
"""Removes fitted trend from target variable.
- Parameters
+ Args:
X (pd.DataFrame, optional): Ignored.
y (pd.Series): Target variable to detrend.
@@ -83,7 +83,7 @@ def transform(self, X, y=None):
def fit_transform(self, X, y=None):
"""Removes fitted trend from target variable.
- Parameters
+ Args:
X (pd.DataFrame, optional): Ignored.
y (pd.Series): Target variable to detrend.
@@ -96,7 +96,7 @@ def fit_transform(self, X, y=None):
def inverse_transform(self, y):
"""Adds back fitted trend to target variable.
- Parameters
+ Args:
X (pd.DataFrame, optional): Ignored.
y (pd.Series): Target variable.
diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
index 6d54523ff4..7ad27453c6 100644
--- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
@@ -19,7 +19,7 @@ class TextFeaturizer(TextTransformer):
LSA (Latent Semantic Analysis). Calling transform on this component will replace any text columns in the given dataset with these numeric columns.
- Parameters
+ Args:
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
@@ -75,7 +75,7 @@ def _make_entity_set(self, X, text_columns):
def fit(self, X, y=None):
"""Fits component to data.
- Parameters
+ Args:
X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
y (pd.Series, np.ndarray, optional): The target training data of length [n_samples]
@@ -115,7 +115,7 @@ def _get_primitives_provenance(features):
def transform(self, X, y=None):
"""Transforms data X by creating new features using existing text columns.
- Parameters
+ Args:
X (pd.DataFrame): The data to transform.
y (pd.Series, optional): Ignored.
diff --git a/evalml/pipelines/components/transformers/preprocessing/text_transformer.py b/evalml/pipelines/components/transformers/preprocessing/text_transformer.py
index cd83db602a..903333a4ed 100644
--- a/evalml/pipelines/components/transformers/preprocessing/text_transformer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/text_transformer.py
@@ -7,7 +7,7 @@
class TextTransformer(Transformer):
"""Base class for all transformers working with text features.
- Parameters
+ Args:
component_obj (obj): Third-party objects useful in component implementation. Defaults to None.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
diff --git a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py
index 3046102a8e..537747afe2 100644
--- a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py
+++ b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py
@@ -112,7 +112,7 @@ def _get_feature_provenance(self):
class EmailFeaturizer(_ExtractFeaturesWithTransformPrimitives):
"""Transformer that can automatically extract features from emails.
- Parameters
+ Args:
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
@@ -135,7 +135,7 @@ def _get_feature_types_for_featuretools(self, X):
class URLFeaturizer(_ExtractFeaturesWithTransformPrimitives):
"""Transformer that can automatically extract features from URL.
- Parameters
+ Args:
random_seed (int): Seed for the random number generator. Defaults to 0.
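An illustrative sketch of the URL featurizer above; the Woodwork typing step and import path are assumptions based on how these featurizers are typically wired up, not part of the diff.

    import pandas as pd
    from evalml.pipelines.components import URLFeaturizer

    X = pd.DataFrame({"page": [
        "https://evalml.alteryx.com/en/stable/",
        "https://github.com/alteryx/evalml",
    ]})
    X.ww.init(logical_types={"page": "URL"})  # mark the column as a URL for the transform primitives

    featurizer = URLFeaturizer()
    X_features = featurizer.fit_transform(X)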
""" diff --git a/evalml/pipelines/components/transformers/samplers/base_sampler.py b/evalml/pipelines/components/transformers/samplers/base_sampler.py index 9a6b4d26f7..d7e68a9454 100644 --- a/evalml/pipelines/components/transformers/samplers/base_sampler.py +++ b/evalml/pipelines/components/transformers/samplers/base_sampler.py @@ -10,7 +10,7 @@ class BaseSampler(Transformer): """Base Sampler component. Used as the base class of all sampler components. - Parameters + Args: parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. @@ -22,7 +22,7 @@ class BaseSampler(Transformer): def fit(self, X, y): """Fits the sampler to the data. - Parameters + Args: X (pd.DataFrame): Input features. y (pd.Series): Target. @@ -39,7 +39,7 @@ def fit(self, X, y): def _initialize_sampler(self, X, y): """Helper function to initialize the sampler component object. - Parameters + Args: X (pd.DataFrame): Features. y (pd.Series): The target data. """ @@ -47,7 +47,7 @@ def _initialize_sampler(self, X, y): def _prepare_data(self, X, y): """Transforms the input data to pandas data structure that our sampler can ingest. - Parameters + Args: X (pd.DataFrame): Training features. y (pd.Series): Target. @@ -63,7 +63,7 @@ def _prepare_data(self, X, y): def transform(self, X, y=None): """Transforms the input data by sampling the data. - Parameters + Args: X (pd.DataFrame): Training features. y (pd.Series): Target. @@ -77,7 +77,7 @@ def transform(self, X, y=None): def _convert_dictionary(self, sampling_dict, y): """Converts the provided sampling dictionary from a dictionary of ratios to a dictionary of number of samples. Expects the provided dictionary keys to be the target values y, and the associated values to be the min:max ratios. Converts and returns a dictionary with the same keys, but changes the values to be the number of samples rather than ratio. - Parameters + Args: sampling_dict (dict): The input sampling dictionary passed in from user. y (pd.Series): The target values. @@ -111,7 +111,7 @@ def _convert_dictionary(self, sampling_dict, y): def _dictionary_to_params(self, sampling_dict, y): """If a sampling ratio dictionary is provided, add the updated sampling dictionary to the parameters and return the updated parameter dictionary. Otherwise, simply return the current parameters. - Parameters + Args: sampling_dict (dict): The input sampling dictionary passed in from user. y (pd.Series): The target values. @@ -133,7 +133,7 @@ def fit_transform(self, X, y): class BaseOversampler(BaseSampler): """Base Oversampler component. Used as the base class of all imbalance-learn oversampler components. - Parameters + Args: sampler (obj): Sampler object to use. sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means we want a 1:4 ratio of the minority to majority class after oversampling. We will create the a sampling dictionary using this ratio, with the keys corresponding to the class @@ -176,7 +176,7 @@ def __init__( def _initialize_sampler(self, X, y): """Initializes the oversampler with the given sampler_ratio or sampler_ratio_dict. If a sampler_ratio_dict is provided, we will opt to use that. Otherwise, we use will create the sampler_ratio_dict dictionary. - Parameters + Args: X (pd.DataFrame): Input features. y (pd.Series): Target. 
""" diff --git a/evalml/pipelines/components/transformers/samplers/oversamplers.py b/evalml/pipelines/components/transformers/samplers/oversamplers.py index 4efb508ebc..d037a1582e 100644 --- a/evalml/pipelines/components/transformers/samplers/oversamplers.py +++ b/evalml/pipelines/components/transformers/samplers/oversamplers.py @@ -7,7 +7,7 @@ class SMOTEOversampler(BaseOversampler): """SMOTE Oversampler component. Works on numerical datasets only. This component is only run during training and not during predict. - Parameters + Args: sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means we want a 1:4 ratio of the minority to majority class after oversampling. We will create the a sampling dictionary using this ratio, with the keys corresponding to the class and the values responding to the number of samples. Defaults to 0.25. @@ -42,7 +42,7 @@ def __init__( class SMOTENCOversampler(BaseOversampler): """SMOTENC Oversampler component. Uses SMOTENC to generate synthetic samples. Works on a mix of numerical and categorical columns. Input data must be Woodwork type, and this component is only run during training and not during predict. - Parameters + Args: sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means we want a 1:4 ratio of the minority to majority class after oversampling. We will create the a sampling dictionary using this ratio, with the keys corresponding to the class and the values responding to the number of samples. Defaults to 0.25. @@ -93,7 +93,7 @@ def fit(self, X, y): class SMOTENOversampler(BaseOversampler): """SMOTEN Oversampler component. Uses SMOTEN to generate synthetic samples. Works for purely categorical datasets. This component is only run during training and not during predict. - Parameters + Args: sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means we want a 1:4 ratio of the minority to majority class after oversampling. We will create the a sampling dictionary using this ratio, with the keys corresponding to the class and the values responding to the number of samples. Defaults to 0.25. diff --git a/evalml/pipelines/components/transformers/samplers/undersampler.py b/evalml/pipelines/components/transformers/samplers/undersampler.py index 39f30d2893..d0aa434e13 100644 --- a/evalml/pipelines/components/transformers/samplers/undersampler.py +++ b/evalml/pipelines/components/transformers/samplers/undersampler.py @@ -13,7 +13,7 @@ class Undersampler(BaseSampler): This component is only run during training and not during predict. - Parameters + Args: sampling_ratio (float): The smallest minority:majority ratio that is accepted as 'balanced'. For instance, a 1:4 ratio would be represented as 0.25, while a 1:1 ratio is 1.0. Must be between 0 and 1, inclusive. Defaults to 0.25. sampling_ratio_dict (dict): A dictionary specifying the desired balanced ratio for each target value. For instance, in a binary case where class 1 is the minority, we could specify: @@ -55,7 +55,7 @@ def __init__( def _initialize_sampler(self, X, y): """Helper function to initialize the undersampler component object. 
-        Parameters
+        Args:
            y (pd.Series): The target data
        """
        param_dic = self._dictionary_to_params(
diff --git a/evalml/pipelines/components/transformers/scalers/standard_scaler.py b/evalml/pipelines/components/transformers/scalers/standard_scaler.py
index ae8f2c544e..02441cc2be 100644
--- a/evalml/pipelines/components/transformers/scalers/standard_scaler.py
+++ b/evalml/pipelines/components/transformers/scalers/standard_scaler.py
@@ -12,7 +12,7 @@ class StandardScaler(Transformer):
     """A transformer that standardizes input features by removing the mean and scaling to unit variance.
 
-    Parameters
+    Args:
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """
 
diff --git a/evalml/pipelines/components/transformers/transformer.py b/evalml/pipelines/components/transformers/transformer.py
index c8b6c55d24..4d1ec69103 100644
--- a/evalml/pipelines/components/transformers/transformer.py
+++ b/evalml/pipelines/components/transformers/transformer.py
@@ -22,7 +22,7 @@ class Transformer(ComponentBase):
 
     To see some examples, check out the definitions of any Transformer component.
 
-    Parameters
+    Args:
        parameters (dict): Dictionary of parameters for the component. Defaults to None.
        component_obj (obj): Third-party objects useful in component implementation. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.
@@ -36,11 +36,11 @@ class Transformer(ComponentBase):
     def transform(self, X, y=None):
        """Transforms data X.
 
-        Parameters
+        Args:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Target data.
 
        Returns:
            pd.DataFrame: Transformed X
        """
        X_ww = infer_feature_types(X)
@@ -60,11 +60,11 @@ def transform(self, X, y=None):
     def fit_transform(self, X, y=None):
        """Fits on X and transforms X.
 
-        Parameters
+        Args:
            X (pd.DataFrame): Data to fit and transform
            y (pd.Series): Target data
 
        Returns:
            pd.DataFrame: Transformed X
        """
        X_ww = infer_feature_types(X)
@@ -95,9 +95,9 @@ class TargetTransformer(Transformer):
     def inverse_transform(self, y):
        """Inverts the transformation done by the transform method.
 
-        Parameters
+        Args:
            y (pd.Series): Target transformed by this component.
 
        Returns:
            pd.Series: Target without the transformation.
        """
diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py
index 0206a82eab..6aafb5b261 100644
--- a/evalml/pipelines/components/utils.py
+++ b/evalml/pipelines/components/utils.py
@@ -34,10 +34,10 @@ def all_components():
 def allowed_model_families(problem_type):
    """List the model types allowed for a particular problem type.
 
-    Parameters
+    Args:
        problem_types (ProblemTypes or str): binary, multiclass, or regression
 
    Returns:
        list[ModelFamily]: a list of model families
    """
 
@@ -58,11 +58,11 @@ def get_estimators(problem_type, model_families=None):
 
     Can also optionally filter by a list of model types.
 
-    Parameters
+    Args:
        problem_type (ProblemTypes or str): problem type to filter for
        model_families (list[ModelFamily] or list[str]): model families to filter for
 
    Returns:
        list[class]: a list of estimator subclasses
    """
    if model_families is not None and not isinstance(model_families, list):
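As a quick usage sketch for the filtering described above (the problem type and model family chosen here are illustrative):

    from evalml.model_family import ModelFamily
    from evalml.pipelines.components.utils import get_estimators

    # Estimator classes that support binary classification, limited to one family.
    estimators = get_estimators("binary", model_families=[ModelFamily.RANDOM_FOREST])
    assert all(est.model_family == ModelFamily.RANDOM_FOREST for est in estimators)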
@@ -102,10 +102,10 @@ def handle_component_class(component_class):
     return a new instance. Otherwise if a ComponentBase subclass or Component
     instance is provided, will return that without modification.
 
-    Parameters
+    Args:
        component (str, ComponentBase): input to be standardized
 
    Returns:
        ComponentBase
    """
    if isinstance(component_class, ComponentBase) or (
@@ -133,7 +133,7 @@ class WrappedSKClassifier(BaseEstimator, ClassifierMixin):
     def __init__(self, pipeline):
        """Scikit-learn classifier wrapper class. Takes an EvalML pipeline as input and returns a scikit-learn classifier class wrapping that pipeline.
 
-        Parameters
+        Args:
            pipeline (PipelineBase or subclass obj): EvalML pipeline
        """
        self.pipeline = pipeline
@@ -146,11 +146,11 @@ def __init__(self, pipeline):
     def fit(self, X, y):
        """Fits component to data.
 
-        Parameters
+        Args:
            X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
            y (pd.Series, optional): the target training data of length [n_samples]
 
        Returns:
            self
        """
        self.classes_ = unique_labels(y)
@@ -163,10 +163,10 @@ def fit(self, X, y):
     def predict(self, X):
        """Make predictions using selected features.
 
-        Parameters
+        Args:
            X (pd.DataFrame): Features
 
        Returns:
            np.ndarray: Predicted values
        """
        check_is_fitted(self, "is_fitted_")
@@ -176,10 +176,10 @@ def predict(self, X):
     def predict_proba(self, X):
        """Make probability estimates for labels.
 
-        Parameters
+        Args:
            X (pd.DataFrame): Features
 
        Returns:
            np.ndarray: Probability estimates
        """
        return self.pipeline.predict_proba(X).to_numpy()
@@ -191,7 +191,7 @@ class WrappedSKRegressor(BaseEstimator, RegressorMixin):
     def __init__(self, pipeline):
        """Scikit-learn regressor wrapper class. Takes an EvalML pipeline as input and returns a scikit-learn regressor class wrapping that pipeline.
 
-        Parameters
+        Args:
            pipeline (PipelineBase or subclass obj): EvalML pipeline
        """
        self.pipeline = pipeline
@@ -203,11 +203,11 @@ def __init__(self, pipeline):
     def fit(self, X, y):
        """Fits component to data.
 
-        Parameters
+        Args:
            X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
            y (pd.Series, optional): the target training data of length [n_samples]
 
        Returns:
            self
        """
        self.pipeline.fit(X, y)
@@ -216,10 +216,10 @@ def fit(self, X, y):
     def predict(self, X):
        """Make predictions using selected features.
 
-        Parameters
+        Args:
            X (pd.DataFrame): Features
 
        Returns:
            np.ndarray: Predicted values
        """
        return self.pipeline.predict(X).to_numpy()
@@ -260,10 +260,10 @@ def scikit_learn_wrapped_estimator(evalml_obj):
 def generate_component_code(element):
    """Creates and returns a string that contains the Python imports and code required for running the EvalML component.
 
-    Parameters
+    Args:
        element (component instance): The instance of the component to generate string Python code for
 
    Returns:
        String representation of Python code that can be run separately in order to recreate the component instance.
        Does not include code for custom component implementation.
    """
@@ -295,11 +295,11 @@ def generate_component_code(element):
 def make_balancing_dictionary(y, sampling_ratio):
    """Makes dictionary for oversampler components. Find ratio of each class to the majority. If the ratio is smaller than the sampling_ratio, we want to oversample, otherwise, we don't want to sample at all, and we leave the data as is.
 
-    Parameters
+    Args:
        y (pd.Series): Target data
        sampling_ratio (float): The balanced ratio we want the samples to meet
 
    Returns:
        Dictionary where keys are the classes, and the corresponding values are the counts of samples for each class that will satisfy sampling_ratio.
    """
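A sketch of the ratio rule described in that docstring, assuming a toy 9:1 binary target; the counts in the comment follow from the documented behavior rather than a verified run:

    import pandas as pd
    from evalml.pipelines.components.utils import make_balancing_dictionary

    y = pd.Series([0] * 90 + [1] * 10)      # class 1 ratio is 10/90, below 0.5
    counts = make_balancing_dictionary(y, sampling_ratio=0.5)
    # Class 1 is topped up to 0.5 * 90 = 45 samples; class 0 already meets the
    # ratio and is left as is, so the expected result is {0: 90, 1: 45}.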
""" diff --git a/evalml/pipelines/multiclass_classification_pipeline.py b/evalml/pipelines/multiclass_classification_pipeline.py index fd6af206c4..811e9b05ec 100644 --- a/evalml/pipelines/multiclass_classification_pipeline.py +++ b/evalml/pipelines/multiclass_classification_pipeline.py @@ -5,7 +5,7 @@ class MulticlassClassificationPipeline(ClassificationPipeline): """Pipeline subclass for all multiclass classification pipelines. - Parameters + Args: component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 38b5699f5f..0c977e145b 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -40,7 +40,7 @@ class PipelineBase(ABC, metaclass=PipelineBaseMeta): """Machine learning pipeline made out of transformers and an Estimator. - Parameters + Args: component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. Note that when duplicate components are specified in a list, the duplicate component names will be modified with the component's index in the list. For example, the component graph @@ -130,7 +130,7 @@ def name(self): def summary(self): """A short summary of the pipeline structure, describing the list of components used. - Examples Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder + Example: Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder """ component_graph = [ type(self.component_graph.component_instances[component]) @@ -204,10 +204,10 @@ def __setitem__(self, index, value): def get_component(self, name): """Returns component by name. - Parameters + Args: name (str): Name of component - Returns: + Returns Component: Component to return """ return self.component_graph.get_component(name) @@ -215,10 +215,10 @@ def get_component(self, name): def describe(self, return_dict=False): """Outputs pipeline details including component parameters. - Parameters + Args: return_dict (bool): If True, return dictionary of information about pipeline. Defaults to False. - Returns: + Returns dict: Dictionary of all component parameters if return_dict is True, else None """ log_title(logger, self.name) @@ -249,10 +249,10 @@ def describe(self, return_dict=False): def compute_estimator_features(self, X, y=None): """Transforms the data by applying all pre-processing components. - Parameters + Args: X (pd.DataFrame): Input data to the pipeline to transform. - Returns: + Returns pd.DataFrame: New transformed features. """ return self.component_graph.compute_final_component_features(X, y=y) @@ -266,22 +266,22 @@ def _fit(self, X, y): def fit(self, X, y): """Build a model. - Parameters + Args: X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. y (pd.Series, np.ndarray): The target training data of length [n_samples]. - Returns: + Returns self """ def transform(self, X, y=None): """Transform the input. - Parameters + Args: X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. y (pd.Series): The target data of length [n_samples]. Defaults to None. - Returns: + Returns pd.DataFrame: Transformed output. 
""" return self.component_graph.transform(X, y) @@ -289,11 +289,11 @@ def transform(self, X, y=None): def predict(self, X, objective=None): """Make predictions using selected features. - Parameters + Args: X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. objective (Object or string): The objective to use to make predictions. - Returns: + Returns pd.Series: Predicted values. """ X = infer_feature_types(X) @@ -305,12 +305,12 @@ def predict(self, X, objective=None): def score(self, X, y, objectives): """Evaluate model performance on current and additional objectives. - Parameters + Args: X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]. y (pd.Series, np.ndarray): True labels of length [n_samples]. objectives (list): Non-empty list of objectives to score on. - Returns: + Returns dict: Ordered dictionary of objective scores. """ @@ -323,7 +323,7 @@ def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives): Will raise a PipelineScoreError if any objectives fail. - Parameters + Args: X (pd.DataFrame): The feature matrix. y (pd.Series): The target data. y_pred (pd.Series): The pipeline predictions. @@ -331,7 +331,7 @@ def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives): Will be a DataFrame for multiclass problems and Series otherwise. Will be None for regression problems. objectives (list): List of objectives to score. - Returns: + Returns dict: Ordered dictionary with objectives and their scores. """ scored_successfully = OrderedDict() @@ -381,7 +381,7 @@ def model_family(self): def parameters(self): """Parameter dictionary for this pipeline. - Returns: + Returns dict: Dictionary of all component parameters. """ components = [ @@ -399,7 +399,7 @@ def parameters(self): def feature_importance(self): """Importance associated with each feature. Features dropped by the feature selection are excluded. - Returns: + Returns pd.DataFrame including feature names and their corresponding importance """ feature_names = self.input_feature_names[self._estimator_name] @@ -413,10 +413,10 @@ def feature_importance(self): def graph(self, filepath=None): """Generate an image representing the pipeline graph. - Parameters + Args: filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph will not be saved. - Returns: + Returns graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks. """ graphviz = import_or_raise( @@ -468,10 +468,10 @@ def graph(self, filepath=None): def graph_feature_importance(self, importance_threshold=0): """Generate a bar graph of the pipeline's feature importance. - Parameters + Args: importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is larger than importance_threshold. Defaults to zero. - Returns: + Returns plotly.Figure, a bar graph showing features and their corresponding importance """ go = import_or_raise( @@ -515,11 +515,11 @@ def graph_feature_importance(self, importance_threshold=0): def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): """Saves pipeline at file path. - Parameters + Args: file_path (str): location to save file pickle_protocol (int): the pickle data stream format. - Returns: + Returns None """ with open(file_path, "wb") as f: @@ -529,10 +529,10 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): def load(file_path): """Loads pipeline at file path. 
@@ -529,10 +529,10 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL):
     def load(file_path):
        """Loads pipeline at file path.
 
-        Parameters
+        Args:
            file_path (str): location to load file
 
        Returns:
            PipelineBase object
        """
        with open(file_path, "rb") as f:
@@ -554,11 +554,11 @@ def clone(self):
     def new(self, parameters, random_seed=0):
        """Constructs a new instance of the pipeline with the same component graph but with a different set of parameters. Not to be confused with python's __new__ method.
 
-        Parameters
+        Args:
            parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
                An empty dictionary or None implies using all default values for component parameters. Defaults to None.
            random_seed (int): Seed for the random number generator. Defaults to 0.
 
        Returns:
            A new instance of this pipeline with identical components.
        """
        return self.__class__(
@@ -677,11 +677,11 @@ def create_objectives(objectives):
     def can_tune_threshold_with_objective(self, objective):
        """Determine whether the threshold of a binary classification pipeline can be tuned.
 
-        Parameters
+        Args:
            pipeline (PipelineBase): Binary classification pipeline.
            objective (ObjectiveBase): Primary AutoMLSearch objective.
 
        Returns:
            bool: True if the pipeline threshold can be tuned.
        """
        return (
@@ -695,7 +695,7 @@ def inverse_transform(self, y):
 
     Components that implement inverse_transform are PolynomialDetrender, LabelEncoder (tbd).
 
-        Parameters
+        Args:
            y (pd.Series): Final component features
        """
        return self.component_graph.inverse_transform(y)
@@ -703,10 +703,10 @@ def inverse_transform(self, y):
     def get_hyperparameter_ranges(self, custom_hyperparameters):
        """Returns hyperparameter ranges from all components as a dictionary.
 
-        Parameters
+        Args:
            custom_hyperparameters (dict): Custom hyperparameters for the pipeline.
 
        Returns:
            dict: Dictionary of hyperparameter ranges for each component in the pipeline.
        """
        hyperparameter_ranges = dict()
diff --git a/evalml/pipelines/regression_pipeline.py b/evalml/pipelines/regression_pipeline.py
index 46e7b6c722..01fc3edf60 100644
--- a/evalml/pipelines/regression_pipeline.py
+++ b/evalml/pipelines/regression_pipeline.py
@@ -6,7 +6,7 @@
 class RegressionPipeline(PipelineBase):
    """Pipeline subclass for all regression pipelines.
 
-    Parameters
+    Args:
        component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
@@ -24,11 +24,11 @@ class RegressionPipeline(PipelineBase):
     def fit(self, X, y):
        """Build a regression model.
 
-        Parameters
+        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (pd.Series, np.ndarray): The target training data of length [n_samples]
 
        Returns:
            self
        """
        X = infer_feature_types(X)
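A regression sketch with toy data, tying fit and score together (the objective name follows EvalML's convention):

    import pandas as pd
    from evalml.pipelines import RegressionPipeline

    X = pd.DataFrame({"a": range(30), "b": range(30, 60)})
    y = pd.Series(range(30))
    pipeline = RegressionPipeline(component_graph=["Imputer", "Random Forest Regressor"])
    pipeline.fit(X, y)
    scores = pipeline.score(X, y, objectives=["R2"])  # ordered dict keyed by objective name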
@@ -42,12 +42,12 @@ def fit(self, X, y):
     def score(self, X, y, objectives):
        """Evaluate model performance on current and additional objectives.
 
-        Parameters
+        Args:
            X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            y (pd.Series, or np.ndarray): True values of length [n_samples]
            objectives (list): Non-empty list of objectives to score on
 
        Returns:
            dict: Ordered dictionary of objective scores
        """
        objectives = self.create_objectives(objectives)
diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py
index 245ade1656..81fb3088fe 100644
--- a/evalml/pipelines/time_series_classification_pipelines.py
+++ b/evalml/pipelines/time_series_classification_pipelines.py
@@ -18,7 +18,7 @@
 class TimeSeriesClassificationPipeline(TimeSeriesPipelineBase, ClassificationPipeline):
    """Pipeline base class for time series classification problems.
 
-    Parameters
+    Args:
        component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
@@ -34,11 +34,11 @@ class TimeSeriesClassificationPipeline(TimeSeriesPipelineBase, ClassificationPip
     def fit(self, X, y):
        """Fit a time series classification pipeline.
 
-        Parameters
+        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (pd.Series, np.ndarray): The target training targets of length [n_samples]
 
        Returns:
            self
        """
        X, y = self._convert_to_woodwork(X, y)
@@ -71,12 +71,12 @@ def _predict(self, X, y, objective=None, pad=False):
     def predict(self, X, y=None, objective=None):
        """Make predictions using selected features.
 
-        Parameters
+        Args:
            X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features].
            y (pd.Series, np.ndarray, None): The target training targets of length [n_samples].
            objective (Object or string): The objective to use to make predictions.
 
        Returns:
            pd.Series: Predicted values.
        """
        if self.estimator is None:
@@ -98,10 +98,10 @@ def predict(self, X, y=None, objective=None):
     def predict_proba(self, X, y=None):
        """Make probability estimates for labels.
 
-        Parameters
+        Args:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].
 
        Returns:
            pd.DataFrame: Probability estimates.
        """
        if self.estimator is None:
@@ -120,12 +120,12 @@ def predict_proba(self, X, y=None):
     def score(self, X, y, objectives):
        """Evaluate model performance on current and additional objectives.
 
-        Parameters
+        Args:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].
            y (pd.Series): True labels of length [n_samples].
            objectives (list): Non-empty list of objectives to score on.
 
        Returns:
            dict: Ordered dictionary of objective scores.
        """
        X, y = self._convert_to_woodwork(X, y)
@@ -153,7 +153,7 @@ class TimeSeriesBinaryClassificationPipeline(
 ):
    """Pipeline base class for time series binary classification problems.
 
-    Parameters
+    Args:
        component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
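Note that, unlike the standard pipelines, the time series predict and score methods above also accept y, since delayed target features must be computed for the prediction window. A sketch, assuming X_train/y_train and X_holdout/y_holdout are time-ordered pandas objects and that the pipeline-level parameters shown match this branch's API:

    from evalml.pipelines import TimeSeriesBinaryClassificationPipeline

    pipeline = TimeSeriesBinaryClassificationPipeline(
        component_graph=["Delayed Feature Transformer", "Random Forest Classifier"],
        parameters={"pipeline": {"date_index": None, "gap": 0, "max_delay": 2}},
    )
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_holdout, y=y_holdout)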
@@ -207,7 +207,7 @@ def _score(X, y, predictions, objective):
 class TimeSeriesMulticlassClassificationPipeline(TimeSeriesClassificationPipeline):
    """Pipeline base class for time series multiclass classification problems.
 
-    Parameters
+    Args:
        component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py
index 7aa0eeffa7..c2078c53d0 100644
--- a/evalml/pipelines/time_series_pipeline_base.py
+++ b/evalml/pipelines/time_series_pipeline_base.py
@@ -9,7 +9,7 @@
 class TimeSeriesPipelineBase(PipelineBase, metaclass=PipelineBaseMeta):
    """Pipeline base class for time series problems.
 
-    Parameters
+    Args:
        component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
@@ -56,11 +56,11 @@ def _convert_to_woodwork(X, y):
     def fit(self, X, y):
        """Fit a time series pipeline.
 
-        Parameters
+        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features].
            y (pd.Series, np.ndarray): The target training targets of length [n_samples].
 
        Returns:
            self
        """
        X, y = self._convert_to_woodwork(X, y)
diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py
index 68d5f9b228..1dd80461df 100644
--- a/evalml/pipelines/time_series_regression_pipeline.py
+++ b/evalml/pipelines/time_series_regression_pipeline.py
@@ -10,7 +10,7 @@
 class TimeSeriesRegressionPipeline(TimeSeriesPipelineBase):
    """Pipeline base class for time series regression problems.
 
-    Parameters
+    Args:
        component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
@@ -29,12 +29,12 @@ class TimeSeriesRegressionPipeline(TimeSeriesPipelineBase):
     def predict(self, X, y=None, objective=None):
        """Make predictions using selected features.
 
-        Parameters
+        Args:
            X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features].
            y (pd.Series, np.ndarray, None): The target training targets of length [n_samples].
            objective (Object or string): The objective to use to make predictions.
 
        Returns:
            pd.Series: Predicted values.
        """
        if self.estimator is None:
@@ -56,12 +56,12 @@ def predict(self, X, y=None, objective=None):
     def score(self, X, y, objectives):
        """Evaluate model performance on current and additional objectives.
 
-        Parameters
+        Args:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].
            y (pd.Series): True labels of length [n_samples].
            objectives (list): Non-empty list of objectives to score on.
 
        Returns:
            dict: Ordered dictionary of objective scores.
        """
        X, y = self._convert_to_woodwork(X, y)
diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py
index 7afface60b..5b5dcebe54 100644
--- a/evalml/pipelines/utils.py
+++ b/evalml/pipelines/utils.py
@@ -59,15 +59,15 @@ def _get_preprocessing_components(
     """
     Given input data, target data and an estimator class, construct a recommended preprocessing chain
     to be combined with the estimator and trained on the provided data.
-    Parameters
-    ----------
+    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples].
        problem_type (ProblemTypes or str): Problem type.
        estimator_class (class): A class which subclasses Estimator estimator for pipeline.
        sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None.
 
    Returns:
        list[Transformer]: A list of applicable preprocessing components to use with the estimator.
    """
 
@@ -192,8 +192,8 @@ def make_pipeline(
     """
     Given input data, target data, an estimator class and the problem type, generates a pipeline class with a preprocessing chain which was recommended based on the inputs.
     The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type.
 
-    Parameters
-    ----------
+    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples].
        estimator (Estimator): Estimator for pipeline.
        problem_type (ProblemTypes or str): Problem type for pipeline to generate.
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
            An empty dictionary or None implies using all default values for component parameters.
        sampler_name (str): The name of the sampler component to add to the pipeline. Only used in classification problems.
            Defaults to None. When None, no sampler is added.
        extra_components (list(ComponentBase)): List of extra components to be added after preprocessing components. Defaults to None.
 
-    Returns
-    -------
+    Returns:
        PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator.
    """
    X = infer_feature_types(X)
@@ -236,10 +236,10 @@ def make_pipeline(
 def generate_pipeline_code(element):
    """Creates and returns a string that contains the Python imports and code required for running the EvalML pipeline.
 
-    Parameters
+    Args:
        element (pipeline instance): The instance of the pipeline to generate string Python code
 
    Returns:
        String representation of Python code that can be run separately in order to recreate the pipeline instance.
        Does not include code for custom component implementation.
    """
@@ -265,7 +265,7 @@ def _make_stacked_ensemble_pipeline(
 ):
    """Creates a pipeline with a stacked ensemble estimator.
 
-    Parameters
+    Args:
        input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators for the stacked ensemble.
            This must not be None or an empty list or else EnsembleMissingPipelinesError will be raised.
        problem_type (ProblemType): problem type of pipeline
        n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines.
            None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1,
            (n_cpus + 1 + n_jobs) are used. Defaults to -1.
 
    Returns:
        Pipeline with appropriate stacked ensemble estimator.
    """
    parameters = {}
@@ -320,10 +320,10 @@ def _make_stacked_ensemble_pipeline(
 def _make_component_list_from_actions(actions):
    """Creates a list of components from the input DataCheckAction list.
 
-    Parameters
+    Args:
        actions (list(DataCheckAction)): List of DataCheckAction objects used to create list of components
 
    Returns:
        List of components used to address the input actions
    """
    components = []
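A sketch of make_pipeline with the arguments documented above (X and y are assumed to be pandas training data):

    from evalml.pipelines.components import RandomForestClassifier
    from evalml.pipelines.utils import make_pipeline

    # Returns a pipeline instance bundling recommended preprocessing
    # (imputation, encoding, etc.) with the chosen estimator.
    pipeline = make_pipeline(X, y, estimator=RandomForestClassifier, problem_type="binary")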
diff --git a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py
index 81c46505a3..77fa844910 100644
--- a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py
+++ b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py
@@ -6,11 +6,9 @@
 class BalancedClassificationSampler(SamplerBase):
-    """
-    Class for balanced classification downsampler.
+    """Class for balanced classification downsampler.
 
-    Parameters
-    ---------
+    Args:
        sampling_ratio (float): The smallest minority:majority ratio that is accepted as 'balanced'.
            For instance, a 1:4 ratio would be represented as 0.25, while a 1:1 ratio is 1.0. Must be between 0 and 1, inclusive. Defaults to 0.25.
        sampling_ratio_dict (dict): A dictionary specifying the desired balanced ratio for each target value. Overrides sampling_ratio if provided.
@@ -22,7 +20,6 @@ class BalancedClassificationSampler(SamplerBase):
            To determine severe imbalance, the minority class must have a class ratio below this and must occur less often than min_samples.
            Must be between 0 and 0.5, inclusive. Defaults to 0.1.
        random_seed (int): The seed to use for random sampling. Defaults to 0.
-
    """
 
    def __init__(
@@ -53,15 +50,12 @@ def __init__(
         self.sampling_ratio_dict = sampling_ratio_dict or {}
 
     def _find_ideal_samples(self, y):
-        """
-        Return dictionary of examples to drop for each class if we need to resample.
+        """Return dictionary of examples to drop for each class if we need to resample.
 
-        Parameters
-        ---------
+        Args:
            y (pd.Series): Target data passed in.
 
-        Returns
-        -------
+        Returns:
            (dict): dictionary with undersample target class as key, and number of samples to remove as the value.
            If we don't need to resample, returns empty dictionary.
        """
@@ -90,15 +84,12 @@
         return {k: v for k, v in drop_values.items() if v > 0}
 
     def _sampling_dict_to_remove_dict(self, y):
-        """
-        Turn the sampling dict input into a dict of samples to remove for each target, similar to the return of _find_ideal_samples.
+        """Turn the sampling dict input into a dict of samples to remove for each target, similar to the return of _find_ideal_samples.
 
-        Parameters
-        ---------
+        Args:
            y (pd.Series): Training data targets.
 
-        Returns
-        -------
+        Returns:
            (dict): dictionary with undersample target class as key, and number of samples to remove as the value.
            If we don't need to resample, returns empty dictionary.
        """
        new_dic = {}
@@ -109,16 +100,13 @@
     def fit_resample(self, X, y):
-        """
-        Resampling technique for this sampler.
+        """Resampling technique for this sampler.
 
-        Parameters
-        ---------
+        Args:
            X (pd.DataFrame): Training data to fit and resample.
            y (pd.Series): Training data targets to fit and resample.
 
-        Returns
-        -------
+        Returns:
            list: Indices to keep for training data.
        """
        y = infer_feature_types(y)
diff --git a/evalml/preprocessing/data_splitters/sampler_base.py b/evalml/preprocessing/data_splitters/sampler_base.py
index db554cadcf..5d59db950f 100644
--- a/evalml/preprocessing/data_splitters/sampler_base.py
+++ b/evalml/preprocessing/data_splitters/sampler_base.py
@@ -3,13 +3,10 @@
 class SamplerBase(ABC):
-    """
-    Base class for all custom samplers.
+    """Base class for all custom samplers.
 
-    Parameters
-    ---------
+    Args:
        random_seed (int): The seed to use for random sampling. Defaults to 0.
-
    """
 
    def __init__(self, random_seed=0):
@@ -17,16 +14,12 @@ def __init__(self, random_seed=0):
     @abstractmethod
     def fit_resample(self, X, y):
-        """
-        Resample the input data with this sampling strategy.
+        """Resample the input data with this sampling strategy.
 
-        Parameters
-        ---------
+        Args:
            X (pd.DataFrame): Training data to fit and resample.
            y (pd.Series): Training data targets to fit and resample.
 
-        Returns
-        -------
+        Returns:
            Tuple(pd.DataFrame, pd.Series) or list: resampled X and y data for oversampling or indices to keep for undersampling.
- """ diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py index 5c3c8123a5..c9e22f5457 100644 --- a/evalml/preprocessing/data_splitters/time_series_split.py +++ b/evalml/preprocessing/data_splitters/time_series_split.py @@ -5,16 +5,14 @@ class TimeSeriesSplit(BaseCrossValidator): - """ - Rolling Origin Cross Validation for time series problems. + """Rolling Origin Cross Validation for time series problems. This class uses max_delay and gap values to take into account that evalml time series pipelines perform some feature and target engineering, e.g delaying input features and shifting the target variable by the desired amount. If the data that will be split already has all the features and appropriate target values, and then set max_delay and gap to 0. - Parameters - --------- + Args: max_delay (int): Max delay value for feature engineering. Time series pipelines create delayed features from existing features. This process will introduce NaNs into the first max_delay number of rows. The splitter uses the last max_delay number of rows from the previous split as the first max_delay number @@ -40,21 +38,18 @@ def _check_if_empty(data): return data is None or data.empty def split(self, X, y=None, groups=None): - """ - Get the time series splits. + """Get the time series splits. X and y are assumed to be sorted in ascending time order. This method can handle passing in empty or None X and y data but note that X and y cannot be None or empty at the same time. - Parameters - --------- + Args: X (pd.DataFrame, None): Features to split. y (pd.DataFrame, None): Target variable to split. Defaults to None. groups: Ignored but kept for compatibility with sklearn API. Defaults to None. - Returns - ------- + Returns: Iterator of (train, test) indices tuples. """ # Sklearn splitters always assume a valid X is passed but we need to support the diff --git a/evalml/preprocessing/data_splitters/training_validation_split.py b/evalml/preprocessing/data_splitters/training_validation_split.py index de0af50e8b..54b9c3914f 100644 --- a/evalml/preprocessing/data_splitters/training_validation_split.py +++ b/evalml/preprocessing/data_splitters/training_validation_split.py @@ -7,8 +7,7 @@ class TrainingValidationSplit(BaseCrossValidator): """Split the training data into training and validation sets. - Parameters - --------- + Args: test_size (float): What percentage of data points should be included in the validation set. Defalts to the complement of `train_size` if `train_size` is set, and 0.25 otherwise. train_size (float): What percentage of data points should be included in the training set. @@ -41,13 +40,11 @@ def get_n_splits(): def split(self, X, y=None): """Divide the data into training and testing sets. - Parameters - --------- + Args: X (pd.DataFrame): Dataframe of points to split y (pd.Series): Series of points to split - Returns - ------- + Returns: list: Indices to split data into training and test set """ train, test = train_test_split( diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index 47f930df12..01e8db0e54 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -12,11 +12,9 @@ def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwargs): - """ - Load features and target from file. + """Load features and target from file. - Parameters - --------- + Args: path (str): Path to file or a http/ftp/s3 URL. index (str): Column for index. 
        target (str): Column for target.
        n_rows (int): Number of rows to return. Defaults to None.
        drop (list): List of columns to drop. Defaults to None.
        verbose (bool): If True, prints information about features and target. Defaults to True.
 
-    Returns
-    -------
+    Returns:
        pd.DataFrame, pd.Series: Features matrix and target.
    """
    feature_matrix = pd.read_csv(path, index_col=index, nrows=n_rows, **kwargs)
@@ -53,8 +50,7 @@ def split_data(
 ):
    """Split data into train and test sets.
 
-    Parameters
-    ---------
+    Args:
        X (pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
        y (pd.Series, or np.ndarray): target data of length [n_samples]
        problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.
 
-    Returns
-    -------
+    Returns:
        pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets.
-
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)
@@ -96,17 +90,13 @@
 def number_of_features(dtypes):
-    """
-    Get the number of features of each specific dtype in a DataFrame.
+    """Get the number of features of each specific dtype in a DataFrame.
 
-    Parameters
-    ---------
+    Args:
        dtypes (pd.Series): DataFrame.dtypes to get the number of features for.
 
-    Returns
-    -------
+    Returns:
        pd.Series: dtypes and the number of features for each input type.
-
    """
    dtype_to_vtype = {
        "bool": "Boolean",
@@ -122,15 +112,12 @@
 def target_distribution(targets):
-    """
-    Get the target distributions.
+    """Get the target distributions.
 
-    Parameters
-    ---------
+    Args:
        targets (pd.Series): Target data.
 
-    Returns
-    -------
+    Returns:
        pd.Series: Target data and their frequency distribution as percentages.
    """
    distribution = targets.value_counts() / len(targets)
@@ -138,16 +125,13 @@
 def drop_nan_target_rows(X, y):
-    """
-    Drop rows in X and y when row in the target y has a value of NaN.
+    """Drop rows in X and y when the row in the target y has a value of NaN.
 
-    Parameters
-    ---------
+    Args:
        X (pd.DataFrame, np.ndarray): Data to transform.
        y (pd.Series, np.ndarray): Target data.
 
-    Returns
-    -------
+    Returns:
        pd.DataFrame, pd.DataFrame: Transformed X (and y, if passed in) with rows that had a NaN value removed.
    """
    X_t = X
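For reference, the typical call to split_data documented above (X and y are assumed to be feature and target data):

    from evalml.preprocessing import split_data

    X_train, X_test, y_train, y_test = split_data(
        X, y, problem_type="binary", test_size=0.2, random_seed=0
    )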
diff --git a/evalml/problem_types/utils.py b/evalml/problem_types/utils.py
index 0f5237adc7..6008064239 100644
--- a/evalml/problem_types/utils.py
+++ b/evalml/problem_types/utils.py
@@ -7,10 +7,10 @@ def handle_problem_types(problem_type):
     """Handles problem_type by either returning the ProblemTypes or converting from a str.
 
-    Parameters
+    Args:
        problem_type (str or ProblemTypes): Problem type that needs to be handled
 
    Returns:
        ProblemTypes
    """
    if isinstance(problem_type, str):
@@ -29,13 +29,13 @@ def handle_problem_types(problem_type):
 def detect_problem_type(y):
    """Determine the type of problem is being solved based on the targets (binary vs multiclass classification, regression) Ignores missing and null data.
 
-    Parameters
+    Args:
        y (pd.Series): the target labels to predict
 
    Returns:
        ProblemType: ProblemType Enum
 
-    Examples
+    Example:
        >>> y = pd.Series([0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1])
        >>> problem_type = detect_problem_type(y)
        >>> assert problem_type == ProblemTypes.BINARY
@@ -55,10 +55,10 @@ def detect_problem_type(y):
 def is_regression(problem_type):
    """Determines if the provided problem_type is a regression problem type.
 
-    Parameters
+    Args:
        problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.
 
    Returns:
        bool: Whether or not the provided problem_type is a regression problem type.
    """
    return handle_problem_types(problem_type) in [
@@ -70,10 +70,10 @@ def is_regression(problem_type):
 def is_binary(problem_type):
    """Determines if the provided problem_type is a binary classification problem type.
 
-    Parameters
+    Args:
        problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.
 
    Returns:
        bool: Whether or not the provided problem_type is a binary classification problem type.
    """
    return handle_problem_types(problem_type) in [
@@ -85,10 +85,10 @@ def is_binary(problem_type):
 def is_multiclass(problem_type):
    """Determines if the provided problem_type is a multiclass classification problem type.
 
-    Parameters
+    Args:
        problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.
 
    Returns:
        bool: Whether or not the provided problem_type is a multiclass classification problem type.
    """
    return handle_problem_types(problem_type) in [
@@ -100,10 +100,10 @@ def is_multiclass(problem_type):
 def is_classification(problem_type):
    """Determines if the provided problem_type is a classification problem type.
 
-    Parameters
+    Args:
        problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.
 
    Returns:
        bool: Whether or not the provided problem_type is a classification problem type.
    """
    return is_binary(problem_type) or is_multiclass(problem_type)
@@ -112,10 +112,10 @@ def is_classification(problem_type):
 def is_time_series(problem_type):
    """Determines if the provided problem_type is a time series problem type.
 
-    Parameters
+    Args:
        problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.
 
    Returns:
        bool: Whether or not the provided problem_type is a time series problem type.
    """
    return handle_problem_types(problem_type) in [
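A short sketch combining the helpers above (assuming they are exported from evalml.problem_types, as the docstrings suggest):

    from evalml.problem_types import handle_problem_types, is_binary, is_time_series

    problem_type = handle_problem_types("time series regression")
    assert is_time_series(problem_type)
    assert not is_binary(problem_type)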
diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
index 9212dff764..a55921e554 100644
--- a/evalml/tests/conftest.py
+++ b/evalml/tests/conftest.py
@@ -1046,7 +1046,7 @@ def _imbalanced_data_X_y(problem_type, categorical_columns, size):
 
     For our targets, we maintain a 1:5, or 0.2, class ratio of minority : majority.
     We only generate minimum amount for X to set the logical_types, so the length of X and y will be different.
 
-    Parameters
+    Args:
        problem_type (str): Either 'binary' or 'multiclass'
        categorical_columns (str): Determines how many categorical cols to use. Either 'all', 'some', or 'none'.
        size (str): Either 'large' or 'small'. 'large' returns a dataset of size 21,000, while 'small' returns a size of 4200
@@ -1093,7 +1093,7 @@ class _AutoMLTestEnv:
     tests that patch Estimator.fit instead of Pipeline.fit or tests that only want to patch a selective
     subset of the methods listed above.
 
-    Examples
+    Example:
        >>> env = _AutoMLTestEnv(problem_type="binary")
        >>> # run_search is short-hand for creating the context manager and then running search
        >>> # env.run_search(automl, score_return_value={automl.objective.name: 1.0})
@@ -1106,7 +1106,7 @@ class _AutoMLTestEnv:
     def __init__(self, problem_type):
        """Create a test environment.
 
-        Parameters
+        Args:
            problem_type (str): The problem type corresponding to the search class you want to test.
 
        Attributes:
@@ -1206,7 +1206,7 @@ def test_context(
 ):
        """A context manager for creating an environment that patches time-consuming pipeline methods. Sets the mock_fit, mock_score, mock_encode_targets, mock_predict_proba, mock_optimize_threshold attributes.
 
-        Parameters
+        Args:
            score_return_value: Passed as the return_value argument of the pipeline.score patch.
            mock_score_side_effect: Passed as the side_effect argument of the pipeline.score patch. Takes precedence over score_return_value.
diff --git a/evalml/tuners/__init__.py b/evalml/tuners/__init__.py
index 86fbae65e9..d434a607d8 100644
--- a/evalml/tuners/__init__.py
+++ b/evalml/tuners/__init__.py
@@ -1,3 +1,4 @@
+"""EvalML tuner classes."""
 from .skopt_tuner import SKOptTuner
 from .tuner import Tuner
 from .tuner_exceptions import NoParamsException, ParameterError
diff --git a/evalml/tuners/grid_search_tuner.py b/evalml/tuners/grid_search_tuner.py
index 414696724f..488b8a793a 100644
--- a/evalml/tuners/grid_search_tuner.py
+++ b/evalml/tuners/grid_search_tuner.py
@@ -1,3 +1,4 @@
+"""Grid Search Optimizer, which generates all of the possible points to search for using a grid."""
 import itertools
 
 from skopt.space import Integer, Real
@@ -9,13 +10,13 @@
 class GridSearchTuner(Tuner):
    """Grid Search Optimizer, which generates all of the possible points to search for using a grid.
 
-    Parameters
+    Args:
        pipeline_hyperparameter_ranges (dict): a set of hyperparameter ranges corresponding to a pipeline's parameters
        n_points (int): The number of points to sample from along each dimension
            defined in the ``space`` argument. Defaults to 10.
        random_seed (int): Seed for random number generator. Unused in this class, defaults to 0.
 
-    Examples
+    Example:
        >>> tuner = GridSearchTuner({'My Component': {'param a': [0.0, 10.0], 'param b': ['a', 'b', 'c']}}, n_points=5)
        >>> proposal = tuner.propose()
        >>> assert proposal.keys() == {'My Component'}
@@ -57,7 +58,7 @@ def __init__(self, pipeline_hyperparameter_ranges, n_points=10, random_seed=0):
     def add(self, pipeline_parameters, score):
        """Not applicable to grid search tuner as generated parameters are not dependent on scores of previous parameters.
 
-        Parameters
+        Args:
            pipeline_parameters (dict): a dict of the parameters used to evaluate a pipeline
            score (float): the score obtained by evaluating the pipeline with the provided parameters
        """
@@ -80,11 +81,11 @@ def propose(self):
     def is_search_space_exhausted(self):
        """Checks if it is possible to generate a set of valid parameters. Stores generated parameters in ``self.curr_params`` to be returned by ``propose()``.
 
+        Returns:
+            bool: If no more valid parameters exist in the search space, return False.
+
        Raises:
            NoParamsException: If a search space is exhausted, then this exception is thrown.
-
-        Returns:
-            bool: If no more valid parameters exists in the search space, return false.
        """
        try:
            self.curr_params = next(self._grid_points)
diff --git a/evalml/tuners/random_search_tuner.py b/evalml/tuners/random_search_tuner.py
index e6f3094ce9..03a2ceef97 100644
--- a/evalml/tuners/random_search_tuner.py
+++ b/evalml/tuners/random_search_tuner.py
@@ -1,3 +1,4 @@
+"""Random Search Optimizer."""
 from skopt import Space
 
 from evalml.tuners import NoParamsException, Tuner
@@ -7,7 +8,7 @@
 class RandomSearchTuner(Tuner):
    """Random Search Optimizer.
-    Parameters
+    Args:
        pipeline_hyperparameter_ranges (dict): a set of hyperparameter ranges corresponding to a pipeline's parameters
        with_replacement (bool): If false, only unique hyperparameters will be shown
        replacement_max_attempts (int): The maximum number of tries to get a unique
            parameter set. Ignored if with_replacement=True
        random_seed (int): Seed for random number generator. Defaults to 0.
 
-    Examples
+    Example:
        >>> tuner = RandomSearchTuner({'My Component': {'param a': [0.0, 10.0], 'param b': ['a', 'b', 'c']}}, random_seed=42)
        >>> proposal = tuner.propose()
        >>> assert proposal.keys() == {'My Component'}
@@ -41,7 +42,7 @@ def __init__(
     def add(self, pipeline_parameters, score):
        """Not applicable to random search tuner as generated parameters are not dependent on scores of previous parameters.
 
-        Parameters
+        Args:
            pipeline_parameters (dict): A dict of the parameters used to evaluate a pipeline
            score (float): The score obtained by evaluating the pipeline with the provided parameters
        """
@@ -72,11 +73,11 @@ def propose(self):
     def is_search_space_exhausted(self):
        """Checks if it is possible to generate a set of valid parameters. Stores generated parameters in ``self.curr_params`` to be returned by ``propose()``.
 
+        Returns:
+            bool: If no more valid parameters exist in the search space, return False.
+
        Raises:
            NoParamsException: If a search space is exhausted, then this exception is thrown.
-
-        Returns:
-            bool: If no more valid parameters exists in the search space, return false.
        """
        if self._with_replacement:
            return False
diff --git a/evalml/tuners/skopt_tuner.py b/evalml/tuners/skopt_tuner.py
index 71935e8a66..3a1469964c 100644
--- a/evalml/tuners/skopt_tuner.py
+++ b/evalml/tuners/skopt_tuner.py
@@ -1,3 +1,4 @@
+"""Bayesian Optimizer."""
 import warnings
 
 import pandas as pd
@@ -14,7 +15,7 @@
 class SKOptTuner(Tuner):
    """Bayesian Optimizer.
 
-    Parameters
+    Args:
        pipeline_hyperparameter_ranges (dict): A set of hyperparameter ranges corresponding to a pipeline's parameters.
        random_seed (int): The seed for the random number generator. Defaults to 0.
    """
@@ -31,11 +32,11 @@ def __init__(self, pipeline_hyperparameter_ranges, random_seed=0):
     def add(self, pipeline_parameters, score):
        """Add score to sample.
 
-        Parameters
+        Args:
            pipeline_parameters (dict): A dict of the parameters used to evaluate a pipeline
            score (float): The score obtained by evaluating the pipeline with the provided parameters
 
        Returns:
            None
        """
        # skip adding nan scores
        try:
            self.opt.tell(flat_parameter_values, score)
        except Exception as e:
            logger.debug(
                "SKOpt tuner received error during add. Score: {}\nParameters: {}\nFlat parameter values: {}\nError: {}".format(
                    pipeline_parameters, score, flat_parameter_values, e
                )
            )
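Unlike the grid and random tuners, SKOptTuner uses the scores passed to add to guide later proposals. A sketch, using the same search-space format as the doctests above:

    from evalml.tuners import SKOptTuner

    tuner = SKOptTuner({'My Component': {'param a': [0.0, 10.0]}})
    proposal = tuner.propose()        # first suggestion, drawn from the prior
    tuner.add(proposal, 0.5)          # report the observed score back
    next_proposal = tuner.propose()   # next suggestion, informed by the sample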
diff --git a/evalml/tuners/tuner.py b/evalml/tuners/tuner.py
index 3e5d776133..79e7a2ac41 100644
--- a/evalml/tuners/tuner.py
+++ b/evalml/tuners/tuner.py
@@ -1,14 +1,15 @@
+"""Base Tuner class."""
 from abc import ABC, abstractmethod
 
 from skopt.space import Categorical, Integer, Real
 
 
 class Tuner(ABC):
-    """Defines API for base Tuner classes.
+    """Base Tuner class.
 
     Tuners implement different strategies for sampling from a search space. They're used in EvalML to search the space of pipeline hyperparameters.
 
-    Parameters
+    Args:
        pipeline_hyperparameter_ranges (dict): a set of hyperparameter ranges corresponding to a pipeline's parameters.
        random_seed (int): The random state. Defaults to 0.
    """
@@ -92,7 +93,7 @@ def _convert_to_pipeline_parameters(self, flat_parameters):
     def add(self, pipeline_parameters, score):
        """Register a set of hyperparameters with the score obtained from training a pipeline with those hyperparameters.
 
-        Parameters
+        Args:
            pipeline_parameters (dict): a dict of the parameters used to evaluate a pipeline
            score (float): the score obtained by evaluating the pipeline with the provided parameters
 
diff --git a/evalml/tuners/tuner_exceptions.py b/evalml/tuners/tuner_exceptions.py
index ac7027c49c..c9f8eee367 100644
--- a/evalml/tuners/tuner_exceptions.py
+++ b/evalml/tuners/tuner_exceptions.py
@@ -1,3 +1,4 @@
+"""Exceptions raised by tuner classes."""
 
 class NoParamsException(Exception):
     """Raised when a tuner exhausts its search space and runs out of parameters to propose."""
diff --git a/evalml/utils/__init__.py b/evalml/utils/__init__.py
index 21e3508014..aa036932e1 100644
--- a/evalml/utils/__init__.py
+++ b/evalml/utils/__init__.py
@@ -1,3 +1,4 @@
+"""Utility methods."""
 from .logger import get_logger, log_subtitle, log_title
 from .gen_utils import (
     classproperty,
diff --git a/evalml/utils/base_meta.py b/evalml/utils/base_meta.py
index 490b81c0ba..0e33ae9b8a 100644
--- a/evalml/utils/base_meta.py
+++ b/evalml/utils/base_meta.py
@@ -1,3 +1,4 @@
+"""Metaclass that overrides creating a new component or pipeline by wrapping methods with validators and setters."""
 from abc import ABCMeta
 from functools import wraps
 
@@ -11,6 +12,7 @@ class BaseMeta(ABCMeta):
     @classmethod
     def set_fit(cls, method):
+        """Wrapper for the fit method."""
         @wraps(method)
         def _set_fit(self, X, y=None):
             return_value = method(self, X, y)
@@ -20,6 +22,7 @@ def _set_fit(self, X, y=None):
         return _set_fit
 
     def __new__(cls, name, bases, dct):
+        """Create a new instance."""
         for attribute in dct:
             if attribute in cls.FIT_METHODS:
                 dct[attribute] = cls.set_fit(dct[attribute])
diff --git a/evalml/utils/cli_utils.py b/evalml/utils/cli_utils.py
index 231cad2ae1..56a1e45dbd 100644
--- a/evalml/utils/cli_utils.py
+++ b/evalml/utils/cli_utils.py
@@ -1,3 +1,4 @@
+"""CLI functions."""
 import locale
 import os
 import platform
diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py
index 1e0ddd2d6d..973109fcfd 100644
--- a/evalml/utils/gen_utils.py
+++ b/evalml/utils/gen_utils.py
@@ -1,3 +1,4 @@
+"""General utility methods."""
 import importlib
 import os
 import warnings
@@ -20,7 +21,7 @@ def import_or_raise(library, error_msg=None, warning=False):
     """Attempts to import the requested library by name.
 
     If the import fails, raises an ImportError or warning.
 
-    Parameters
+    Args:
        library (str): the name of the library
        error_msg (str): error message to return if the import fails
        warning (bool): if True, import_or_raise gives a warning instead of ImportError. Defaults to False.
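A sketch of the optional-dependency pattern import_or_raise supports (the error message is illustrative):

    from evalml.utils.gen_utils import import_or_raise

    go = import_or_raise(
        "plotly.graph_objects",
        error_msg="plotly is required for graphing",
    )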
@@ -75,7 +76,7 @@ def convert_to_seconds(input_str):
 def get_random_state(seed):
    """Generates a numpy.random.RandomState instance using seed.
 
-    Parameters
+    Args:
        seed (None, int, np.random.RandomState object): seed to use to generate numpy.random.RandomState. Must be between SEED_BOUNDS.min_bound and SEED_BOUNDS.max_bound, inclusive. Otherwise, an exception will be thrown.
    """
    if isinstance(seed, (int, np.integer)) and (
@@ -96,7 +97,7 @@ def get_random_seed(
 
     To protect against invalid input to a particular library's random number generator, if an int value is provided, and it is outside the bounds "[min_bound, max_bound)", the value will be projected into the range between the min_bound (inclusive) and max_bound (exclusive) using modular arithmetic.
 
-    Parameters
+    Args:
        random_state (int, numpy.random.RandomState): random state
        min_bound (None, int): if not default of None, will be min bound when generating seed (inclusive). Must be less than max_bound.
        max_bound (None, int): if not default of None, will be max bound when generating seed (exclusive). Must be greater than min_bound.
@@ -120,8 +121,7 @@ def get_random_seed(
 class classproperty:
    """Allows function to be accessed as a class level property.
 
-    Examples
-
+    Example:
    .. code-block::
 
        class LogisticRegressionBinaryPipeline(PipelineBase):
            ...
    """
 
    def __init__(self, func):
        self.func = func
 
    def __get__(self, _, klass):
+        """Get property value."""
        return self.func(klass)
 
 
 def _get_subclasses(base_class):
    """Gets all of the leaf nodes in the hiearchy tree for a given base class.
 
-    Parameters
+    Args:
        base_class (abc.ABCMeta): Class to find all of the children for.
 
    Returns:
        subclasses (list): List of all children that are not base classes.
    """
-
    classes_to_check = base_class.__subclasses__()
    subclasses = []
 
@@ -187,7 +187,7 @@ def _get_subclasses(base_class):
 def get_importable_subclasses(base_class, used_in_automl=True):
    """Get importable subclasses of a base class. Used to list all of our estimators, transformers, components and pipelines dynamically.
 
-    Parameters
+    Args:
        base_class (abc.ABCMeta): Base class to find all of the subclasses for.
        args (list): Args used to instantiate the subclass. [{}] for a pipeline, and []
            for all other classes.
 
@@ -222,7 +222,7 @@ def get_importable_subclasses(base_class, used_in_automl=True):
 def _rename_column_names_to_numeric(X, flatten_tuples=True):
    """Used in LightGBM and XGBoost estimator classes to rename column names when the input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that these estimators cannot natively handle.
 
-    Parameters
+    Args:
        X (pd.DataFrame): The input training data of shape [n_samples, n_features]
        flatten_tuples (bool): Whether to flatten MultiIndex or tuple column names. LightGBM cannot handle columns with tuple names.
 
@@ -248,13 +248,10 @@
 def jupyter_check():
-    """Get whether or not the code is being run in a Ipython environment (such as Jupyter Notebook or Jupyter Lab)
-
-    Parameters
-    None
+    """Get whether or not the code is being run in an IPython environment (such as Jupyter Notebook or Jupyter Lab).
 
     Returns:
-        Boolean: True if Ipython, False otherwise
+        Boolean: True if IPython, False otherwise.
""" try: ipy = import_or_raise("IPython") @@ -266,8 +263,8 @@ def jupyter_check(): def safe_repr(value): """Convert the given value into a string that can safely be used for repr. - Parameters - value: the item to convert + Args: + value: The item to convert Returns: String representation of the value @@ -283,7 +280,7 @@ def safe_repr(value): def is_all_numeric(df): """Checks if the given DataFrame contains only numeric values. - Parameters + Args: df (pd.DataFrame): The DataFrame to check data types of. Returns: @@ -301,7 +298,7 @@ def is_all_numeric(df): def pad_with_nans(pd_data, num_to_pad): """Pad the beginning num_to_pad rows with nans. - Parameters + Args: pd_data (pd.DataFrame or pd.Series): Data to pad. Returns: @@ -326,7 +323,7 @@ def pad_with_nans(pd_data, num_to_pad): def _get_rows_without_nans(*data): """Compute a boolean array marking where all entries in the data are non-nan. - Parameters + Args: *data (sequence of pd.Series or pd.DataFrame) Returns: @@ -351,13 +348,12 @@ def _not_nan(pd_data): def drop_rows_with_nans(*pd_data): """Drop rows that have any NaNs in all dataframes or series. - Parameters + Args: *pd_data (sequence of pd.Series or pd.DataFrame or None) Returns: list of pd.DataFrame or pd.Series or None """ - mask = _get_rows_without_nans(*pd_data) def _subset(pd_data): @@ -371,7 +367,7 @@ def _subset(pd_data): def _file_path_check(filepath=None, format="png", interactive=False, is_plotly=False): """Helper function to check the filepath being passed. - Parameters + Args: filepath (str or Path, optional): Location to save file. format (str): Extension for figure to be saved as. Defaults to 'png'. interactive (bool, optional): If True and fig is of type plotly.Figure, sets the format to 'html'. @@ -406,7 +402,7 @@ def save_plot( ): """Saves fig to filepath if specified, or to a default location if not. - Parameters + Args: fig (Figure): Figure to be saved. filepath (str or Path, optional): Location to save file. Default is with filename "test_plot". format (str): Extension for figure to be saved as. Ignored if interactive is True and fig @@ -474,7 +470,7 @@ def save_plot( def deprecate_arg(old_arg, new_arg, old_value, new_value): """Helper to raise warnings when a deprecated arg is used. - Parameters + Args: old_arg (str): Name of old/deprecated argument. new_arg (str): Name of new argument. old_value (Any): Value the user passed in for the old argument. diff --git a/evalml/utils/logger.py b/evalml/utils/logger.py index 5c716cfe95..a1fe0a2a8d 100644 --- a/evalml/utils/logger.py +++ b/evalml/utils/logger.py @@ -1,3 +1,4 @@ +"""Logging functions.""" import logging import os import sys @@ -8,6 +9,11 @@ def get_logger(name): + """Get the logger with the associated name. + + Args: + name (str): Name of the logger to get. + """ logger = logging.getLogger(name) if not len(logger.handlers): logger.setLevel(logging.DEBUG) @@ -44,6 +50,7 @@ def get_logger(name): def log_title(logger, title): + """Log with a title.""" logger.info("\n" + "*" * (len(title) + 4)) logger.info("* %s *" % title) logger.info("*" * (len(title) + 4)) @@ -51,6 +58,7 @@ def log_title(logger, title): def log_subtitle(logger, title, underline="="): + """Log with a subtitle.""" logger.info("") logger.info("%s" % title) logger.info(underline * len(title)) @@ -59,13 +67,12 @@ def log_subtitle(logger, title, underline="="): def time_elapsed(start_time): """How much time has elapsed since the search started. - Parameters + Args: start_time (int): Time when search started. 
    Returns:
        str: elapsed time formatted as a string [H:]MM:SS
    """
-
    time_diff = time.time() - start_time
    # Source: tqdm.std.tqdm.format_interval
    mins, s = divmod(int(time_diff), 60)
diff --git a/evalml/utils/update_checker.py b/evalml/utils/update_checker.py
index 9bac8ff898..34cc5451a4 100644
--- a/evalml/utils/update_checker.py
+++ b/evalml/utils/update_checker.py
@@ -1,3 +1,4 @@
+"""Check if EvalML has updated since the user installed."""
 from pkg_resources import iter_entry_points
 
 for entry_point in iter_entry_points("alteryx_open_src_initialize"):
diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py
index fa995f5ffc..5d679bdb97 100644
--- a/evalml/utils/woodwork_utils.py
+++ b/evalml/utils/woodwork_utils.py
@@ -1,3 +1,4 @@
+"""Woodwork utility methods."""
 import numpy as np
 import pandas as pd
 import woodwork as ww
@@ -47,13 +48,13 @@ def _raise_value_error_if_nullable_types_detected(data):
 def infer_feature_types(data, feature_types=None):
     """Create a Woodwork structure from the given list, pandas, or numpy input, with specified types for columns. If a column's type is not specified, it will be inferred by Woodwork.
 
-    Parameters
+    Args:
         data (pd.DataFrame, pd.Series): Input data to convert to a Woodwork data structure.
         feature_types (string, ww.logical_type obj, dict, optional): If data is a 2D structure, feature_types must be a dictionary
             mapping column names to the type of data represented in the column. If data is a 1D structure, then feature_types must be
             a Woodwork logical type or a string representing a Woodwork logical type ("Double", "Integer", "Boolean", "Categorical", "Datetime", "NaturalLanguage")
 
-    Returns
+    Returns:
        A Woodwork data structure where the data type of each column was either specified or inferred.
     """
     if isinstance(data, list):
@@ -114,13 +115,13 @@ def _retain_custom_types_and_initalize_woodwork(
 ):
     """Helper method which will take an old Woodwork data structure and a new pandas data structure and return a new data structure that will try to retain as many logical types from the old data structure that exist in the new pandas data structure as possible.
 
-    Parameters
+    Args:
         old_logical_types (Dict): Logical types to try to retain.
         new_dataframe (pd.DataFrame): Pandas data structure
         ltypes_to_ignore (list): List of Woodwork logical types to ignore. Columns from the old DataFrame that have a logical type
            specified in this list will not have their logical types carried over to the new DataFrame returned
 
-    Returns
+    Returns:
        A new DataFrame where any of the columns that exist in the old input DataFrame and the new DataFrame try to retain
        the original logical type, if possible and not specified to be ignored.
     """
@@ -151,11 +152,11 @@ def _retain_custom_types_and_initalize_woodwork(
 def _convert_numeric_dataset_pandas(X, y):
     """Convert numeric and non-null data to pandas datatype. Raises ValueError if there is null or non-numeric data. Used with data sampler strategies.
- Parameters + Args: X (pd.DataFrame, np.ndarray): Data to transform y (pd.Series, np.ndarray): Target data - Returns: + Returns Tuple(pd.DataFrame, pd.Series): Transformed X and y """ X_ww = infer_feature_types(X) From 3ad38478a272af430030f6bedca9221eb5ba8c32 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Mon, 30 Aug 2021 00:30:08 -0400 Subject: [PATCH 14/62] fixing more --- evalml/automl/automl_search.py | 1 + evalml/automl/engine/cf_engine.py | 4 ++-- evalml/automl/engine/sequential_engine.py | 6 +++--- evalml/automl/pipeline_search_plots.py | 1 + .../data_checks/multicollinearity_data_check.py | 6 +++--- .../natural_language_nan_data_check.py | 2 +- evalml/data_checks/sparsity_data_check.py | 2 +- .../data_checks/target_distribution_data_check.py | 4 ++-- evalml/data_checks/target_leakage_data_check.py | 2 +- evalml/model_family/__init__.py | 1 + evalml/model_family/model_family.py | 3 +++ evalml/model_family/utils.py | 4 ++-- .../prediction_explanations/explainers.py | 3 ++- evalml/objectives/cost_benefit_matrix.py | 4 +--- .../transformers/samplers/base_sampler.py | 2 +- .../transformers/samplers/undersampler.py | 3 ++- evalml/pipelines/utils.py | 6 +++--- evalml/problem_types/__init__.py | 1 + evalml/problem_types/problem_types.py | 2 ++ evalml/problem_types/utils.py | 15 ++++++++------- evalml/tuners/tuner_exceptions.py | 2 ++ evalml/utils/base_meta.py | 1 + evalml/utils/logger.py | 2 +- 23 files changed, 45 insertions(+), 32 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 339d3bfef9..862a38890d 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -707,6 +707,7 @@ def _validate_objective(self, objective): def __str__(self): """Returns string representation of the AutoMLSearch object.""" + def _print_list(obj_list): lines = sorted(["\t{}".format(o.name) for o in obj_list]) return "\n".join(lines) diff --git a/evalml/automl/engine/cf_engine.py b/evalml/automl/engine/cf_engine.py index c6b8cc025f..9b2b413c4e 100644 --- a/evalml/automl/engine/cf_engine.py +++ b/evalml/automl/engine/cf_engine.py @@ -10,7 +10,7 @@ class CFClient: """Custom CFClient API to match Dask's CFClient and allow context management. - + Args: pool(cf.ThreadPoolExecutor or cf.ProcessPoolExecutor): the resource pool to execute the futures work on. """ @@ -33,7 +33,7 @@ def submit(self, *args, **kwargs): class CFComputation(EngineComputation): """A Future-like wrapper around jobs created by the CFEngine. - + Args: future(cf.Future): The concurrent.futures.Future that is desired to be executed. """ diff --git a/evalml/automl/engine/sequential_engine.py b/evalml/automl/engine/sequential_engine.py index dd05d3cc04..39e9d1af0a 100644 --- a/evalml/automl/engine/sequential_engine.py +++ b/evalml/automl/engine/sequential_engine.py @@ -51,7 +51,7 @@ class SequentialEngine(EngineBase): def submit_evaluation_job(self, automl_config, pipeline, X, y): """Submit a job to evaluate a pipeline. - + Args: automl_config: Structure containing data passed from AutoMLSearch instance. pipeline (pipeline.PipelineBase): Pipeline to evaluate. @@ -70,7 +70,7 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y): def submit_training_job(self, automl_config, pipeline, X, y): """Submit a job to train a pipeline. - + Args: automl_config: Structure containing data passed from AutoMLSearch instance. pipeline (pipeline.PipelineBase): Pipeline to evaluate. 
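Since infer_feature_types is the backbone of the type handling changed in woodwork_utils above, a short sketch of how a caller overrides inference, assuming the public export from evalml.utils:

    import pandas as pd
    from evalml.utils import infer_feature_types

    X = pd.DataFrame({"grade": ["a", "b", "a"], "score": [90.0, 72.5, 88.0]})
    # Pin one column's logical type; the rest is inferred by Woodwork.
    X = infer_feature_types(X, feature_types={"grade": "Categorical"})
    print(X.ww.logical_types)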
@@ -88,7 +88,7 @@ def submit_training_job(self, automl_config, pipeline, X, y): def submit_scoring_job(self, automl_config, pipeline, X, y, objectives): """Submit a job to score a pipeline. - + Args: automl_config: Structure containing data passed from AutoMLSearch instance. pipeline (pipeline.PipelineBase): Pipeline to train. diff --git a/evalml/automl/pipeline_search_plots.py b/evalml/automl/pipeline_search_plots.py index f21c1a90cb..c8cca357b6 100644 --- a/evalml/automl/pipeline_search_plots.py +++ b/evalml/automl/pipeline_search_plots.py @@ -9,6 +9,7 @@ class SearchIterationPlot: results (dict): Dictionary of current results. objective (ObjectiveBase): Objective that AutoML is optimizing for. """ + def __init__(self, results, objective): self._go = import_or_raise( "plotly.graph_objects", diff --git a/evalml/data_checks/multicollinearity_data_check.py b/evalml/data_checks/multicollinearity_data_check.py index 59d8fb6170..0167f8306b 100644 --- a/evalml/data_checks/multicollinearity_data_check.py +++ b/evalml/data_checks/multicollinearity_data_check.py @@ -22,14 +22,14 @@ def __init__(self, threshold=0.9): def validate(self, X, y=None): """Check if any set of features are likely to be multicollinear. - Args: + Args: X (pd.DataFrame): The input features to check. y (pd.Series): The target. Ignored. - Returns: + Returns: dict: dict with a DataCheckWarning if there are any potentially multicollinear columns. - Example: + Example: >>> import pandas as pd >>> col = pd.Series([1, 0, 2, 3, 4]) >>> X = pd.DataFrame({"col_1": col, "col_2": col * 3}) diff --git a/evalml/data_checks/natural_language_nan_data_check.py b/evalml/data_checks/natural_language_nan_data_check.py index 1fba84674f..be7cd2ee2e 100644 --- a/evalml/data_checks/natural_language_nan_data_check.py +++ b/evalml/data_checks/natural_language_nan_data_check.py @@ -19,7 +19,7 @@ def validate(self, X, y=None): dict: dict with a DataCheckError if NaN values are present in natural language columns. Example: - + >>> import pandas as pd >>> import woodwork as ww >>> import numpy as np diff --git a/evalml/data_checks/sparsity_data_check.py b/evalml/data_checks/sparsity_data_check.py index acab25fbdf..019383b8e9 100644 --- a/evalml/data_checks/sparsity_data_check.py +++ b/evalml/data_checks/sparsity_data_check.py @@ -46,7 +46,7 @@ def validate(self, X, y=None): Returns: dict: dict with a DataCheckWarning if there are any sparse columns. - Example: + Example: >>> import pandas as pd >>> df = pd.DataFrame({ ... 'sparse': [float(x) for x in range(100)], diff --git a/evalml/data_checks/target_distribution_data_check.py b/evalml/data_checks/target_distribution_data_check.py index 8e187b7f2e..a48c69d011 100644 --- a/evalml/data_checks/target_distribution_data_check.py +++ b/evalml/data_checks/target_distribution_data_check.py @@ -21,14 +21,14 @@ def validate(self, X, y): """Check if the target data has a certain distribution. Args: - + X (pd.DataFrame, np.ndarray): Features. Ignored. y (pd.Series, np.ndarray): Target data to check for underlying distributions. Returns: dict (DataCheckError): List with DataCheckErrors if certain distributions are found in the target data. 
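The CFClient and CFComputation wrappers documented above exist so AutoML can treat concurrent.futures pools like Dask clients. A hypothetical wiring, assuming the CFEngine class defined alongside them in evalml.automl.engine.cf_engine:

    from concurrent.futures import ThreadPoolExecutor
    from evalml.automl.engine.cf_engine import CFClient, CFEngine

    # Wrap a thread pool so evaluation/training/scoring jobs can be submitted to it.
    client = CFClient(ThreadPoolExecutor(max_workers=4))
    engine = CFEngine(client)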
- Example: + Example: >>> from scipy.stats import lognorm >>> X = None >>> y = [0.946, 0.972, 1.154, 0.954, 0.969, 1.222, 1.038, 0.999, 0.973, 0.897] diff --git a/evalml/data_checks/target_leakage_data_check.py b/evalml/data_checks/target_leakage_data_check.py index 2816bcab74..ce82e07b6b 100644 --- a/evalml/data_checks/target_leakage_data_check.py +++ b/evalml/data_checks/target_leakage_data_check.py @@ -73,7 +73,7 @@ def validate(self, X, y): Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. Args: - + X (pd.DataFrame, np.ndarray): The input features to check y (pd.Series, np.ndarray): The target data diff --git a/evalml/model_family/__init__.py b/evalml/model_family/__init__.py index f49a8db80e..6cbe374915 100644 --- a/evalml/model_family/__init__.py +++ b/evalml/model_family/__init__.py @@ -1,2 +1,3 @@ +"""Family of machine learning models.""" from .model_family import ModelFamily from .utils import handle_model_family diff --git a/evalml/model_family/model_family.py b/evalml/model_family/model_family.py index f79381ae58..6e2ed44bdc 100644 --- a/evalml/model_family/model_family.py +++ b/evalml/model_family/model_family.py @@ -1,3 +1,4 @@ +"""Enum for family of machine learning models.""" from enum import Enum @@ -47,6 +48,7 @@ class ModelFamily(Enum): """None""" def __str__(self): + """String representation of a ModelFamily enum.""" model_family_dict = { ModelFamily.K_NEIGHBORS.name: "K Nearest Neighbors", ModelFamily.RANDOM_FOREST.name: "Random Forest", @@ -67,6 +69,7 @@ def __str__(self): return model_family_dict[self.name] def __repr__(self): + """String representation of a ModelFamily enum.""" return "ModelFamily." + self.name def is_tree_estimator(self): diff --git a/evalml/model_family/utils.py b/evalml/model_family/utils.py index f18b82ad4d..0a289a6dbe 100644 --- a/evalml/model_family/utils.py +++ b/evalml/model_family/utils.py @@ -1,3 +1,4 @@ +"""Utility methods for EvalML's model families.""" from .model_family import ModelFamily @@ -7,10 +8,9 @@ def handle_model_family(model_family): Args: model_family (str or ModelFamily): Model type that needs to be handled - Returns + Returns: ModelFamily """ - if isinstance(model_family, str): try: tpe = ModelFamily[model_family.upper()] diff --git a/evalml/model_understanding/prediction_explanations/explainers.py b/evalml/model_understanding/prediction_explanations/explainers.py index b5b9fe0c9e..818e9a49ab 100644 --- a/evalml/model_understanding/prediction_explanations/explainers.py +++ b/evalml/model_understanding/prediction_explanations/explainers.py @@ -110,7 +110,7 @@ def explain_predictions( def _update_progress(start_time, current_time, progress_stage, callback_function): """Helper function for updating progress of a function and making a call to the user-provided callback function, if provided. 
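The __str__ and __repr__ overrides added above make model families print cleanly, and handle_model_family converts user strings to the enum. For instance:

    from evalml.model_family import ModelFamily, handle_model_family

    assert str(ModelFamily.RANDOM_FOREST) == "Random Forest"
    assert repr(ModelFamily.RANDOM_FOREST) == "ModelFamily.RANDOM_FOREST"
    assert handle_model_family("random_forest") == ModelFamily.RANDOM_FOREST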
- + The callback function should accept the following parameters: - progress_stage: stage of computation - time_elapsed: total time in seconds that has elapsed since start of call @@ -122,6 +122,7 @@ def _update_progress(start_time, current_time, progress_stage, callback_function class ExplainPredictionsStage(Enum): """Enum for prediction stage.""" + PREPROCESSING_STAGE = "preprocessing_stage" PREDICT_STAGE = "predict_stage" COMPUTE_FEATURE_STAGE = "compute_feature_stage" diff --git a/evalml/objectives/cost_benefit_matrix.py b/evalml/objectives/cost_benefit_matrix.py index e87ac983d8..9c4d790335 100644 --- a/evalml/objectives/cost_benefit_matrix.py +++ b/evalml/objectives/cost_benefit_matrix.py @@ -24,9 +24,7 @@ class CostBenefitMatrix(BinaryClassificationObjective): def __init__(self, true_positive, true_negative, false_positive, false_negative): if None in {true_positive, true_negative, false_positive, false_negative}: - raise ValueError( - "Args: to CostBenefitMatrix must all be numeric values." - ) + raise ValueError("Args: to CostBenefitMatrix must all be numeric values.") self.true_positive = true_positive self.true_negative = true_negative diff --git a/evalml/pipelines/components/transformers/samplers/base_sampler.py b/evalml/pipelines/components/transformers/samplers/base_sampler.py index d7e68a9454..6753c7cb87 100644 --- a/evalml/pipelines/components/transformers/samplers/base_sampler.py +++ b/evalml/pipelines/components/transformers/samplers/base_sampler.py @@ -51,7 +51,7 @@ def _prepare_data(self, X, y): X (pd.DataFrame): Training features. y (pd.Series): Target. - Returns: + Returns: pd.DataFrame, pd.Series: Prepared X and y data as pandas types """ X = infer_feature_types(X) diff --git a/evalml/pipelines/components/transformers/samplers/undersampler.py b/evalml/pipelines/components/transformers/samplers/undersampler.py index d0aa434e13..6cf4a6260d 100644 --- a/evalml/pipelines/components/transformers/samplers/undersampler.py +++ b/evalml/pipelines/components/transformers/samplers/undersampler.py @@ -56,7 +56,8 @@ def _initialize_sampler(self, X, y): """Helper function to initialize the undersampler component object. Args: - y (pd.Series): The target data + X (pd.DataFrame): Ignored. + y (pd.Series): The target data. """ param_dic = self._dictionary_to_params( self.parameters["sampling_ratio_dict"], y diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 5b5dcebe54..325d2c9bbb 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -60,7 +60,7 @@ def _get_preprocessing_components( Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data. Args: - + X (pd.DataFrame): The input data of shape [n_samples, n_features]. y (pd.Series): The target data of length [n_samples]. problem_type (ProblemTypes or str): Problem type. @@ -193,7 +193,7 @@ def make_pipeline( Given input data, target data, an estimator class and the problem type, generates a pipeline class with a preprocessing chain which was recommended based on the inputs. The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. Args: - + X (pd.DataFrame): The input data of shape [n_samples, n_features]. y (pd.Series): The target data of length [n_samples]. estimator (Estimator): Estimator for pipeline. 
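Given the validation in CostBenefitMatrix.__init__ above, all four confusion-matrix cells must be numeric when constructing the objective. A minimal usage sketch:

    from evalml.objectives import CostBenefitMatrix

    # Payoff (in any consistent unit) for each prediction outcome.
    objective = CostBenefitMatrix(
        true_positive=100, true_negative=1, false_positive=-5, false_negative=-50
    )
    # Passing None for any cell raises the ValueError shown above.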
@@ -205,7 +205,7 @@ def make_pipeline( extra_components (list(ComponentBase)): List of extra components to be added after preprocessing components. Defaults to None. Returns - + PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator. """ X = infer_feature_types(X) diff --git a/evalml/problem_types/__init__.py b/evalml/problem_types/__init__.py index 6c1dd73d60..b70c8ab71b 100644 --- a/evalml/problem_types/__init__.py +++ b/evalml/problem_types/__init__.py @@ -1,3 +1,4 @@ +"""The supported types of machine learning problems.""" from .problem_types import ProblemTypes from .utils import ( handle_problem_types, diff --git a/evalml/problem_types/problem_types.py b/evalml/problem_types/problem_types.py index 8e44cfa949..b2d3555a61 100644 --- a/evalml/problem_types/problem_types.py +++ b/evalml/problem_types/problem_types.py @@ -1,3 +1,4 @@ +"""Enum defining the supported types of machine learning problems.""" from enum import Enum from evalml.utils import classproperty @@ -20,6 +21,7 @@ class ProblemTypes(Enum): """Time series multiclass classification problem.""" def __str__(self): + """String representation of the ProblemTypes enum.""" problem_type_dict = { ProblemTypes.BINARY.name: "binary", ProblemTypes.MULTICLASS.name: "multiclass", diff --git a/evalml/problem_types/utils.py b/evalml/problem_types/utils.py index 6008064239..efafbcdb7b 100644 --- a/evalml/problem_types/utils.py +++ b/evalml/problem_types/utils.py @@ -1,3 +1,4 @@ +"""Utility methods for the ProblemTypes enum in EvalML.""" import pandas as pd from pandas.api.types import is_numeric_dtype @@ -10,7 +11,7 @@ def handle_problem_types(problem_type): Args: problem_type (str or ProblemTypes): Problem type that needs to be handled - Returns + Returns: ProblemTypes """ if isinstance(problem_type, str): @@ -32,7 +33,7 @@ def detect_problem_type(y): Args: y (pd.Series): the target labels to predict - Returns + Returns: ProblemType: ProblemType Enum Example: @@ -58,7 +59,7 @@ def is_regression(problem_type): Args: problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. - Returns + Returns: bool: Whether or not the provided problem_type is a regression problem type. """ return handle_problem_types(problem_type) in [ @@ -73,7 +74,7 @@ def is_binary(problem_type): Args: problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. - Returns + Returns: bool: Whether or not the provided problem_type is a binary classification problem type. """ return handle_problem_types(problem_type) in [ @@ -88,7 +89,7 @@ def is_multiclass(problem_type): Args: problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. - Returns + Returns: bool: Whether or not the provided problem_type is a multiclass classification problem type. """ return handle_problem_types(problem_type) in [ @@ -103,7 +104,7 @@ def is_classification(problem_type): Args: problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. - Returns + Returns: bool: Whether or not the provided problem_type is a classification problem type. 
""" return is_binary(problem_type) or is_multiclass(problem_type) @@ -115,7 +116,7 @@ def is_time_series(problem_type): Args: problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. - Returns + Returns: bool: Whether or not the provided problem_type is a time series problem type. """ return handle_problem_types(problem_type) in [ diff --git a/evalml/tuners/tuner_exceptions.py b/evalml/tuners/tuner_exceptions.py index c9f8eee367..138b1b179a 100644 --- a/evalml/tuners/tuner_exceptions.py +++ b/evalml/tuners/tuner_exceptions.py @@ -1,4 +1,6 @@ """Exception thrown by tuner classes.""" + + class NoParamsException(Exception): """Raised when a tuner exhausts its search space and runs out of parameters to propose.""" diff --git a/evalml/utils/base_meta.py b/evalml/utils/base_meta.py index 0e33ae9b8a..24e9a42751 100644 --- a/evalml/utils/base_meta.py +++ b/evalml/utils/base_meta.py @@ -13,6 +13,7 @@ class BaseMeta(ABCMeta): @classmethod def set_fit(cls, method): """Wrapper for the fit method.""" + @wraps(method) def _set_fit(self, X, y=None): return_value = method(self, X, y) diff --git a/evalml/utils/logger.py b/evalml/utils/logger.py index a1fe0a2a8d..e89b75e2af 100644 --- a/evalml/utils/logger.py +++ b/evalml/utils/logger.py @@ -10,7 +10,7 @@ def get_logger(name): """Get the logger with the associated name. - + Args: name (str): Name of the logger to get. """ From cf45bb705c7d0421170533a6ea9591ee072c7073 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Mon, 30 Aug 2021 00:33:54 -0400 Subject: [PATCH 15/62] add ignore test directory flag to lint command --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eeff7163dc..7ef23d5bf0 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ clean: .PHONY: lint lint: flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions - pydocstyle evalml/ --convention=google --add-ignore=D107 --add-select=D400 + pydocstyle evalml/ --convention=google --add-ignore=D107 --add-select=D400 --match-dir='[^\.|?!test].*' black evalml -t py39 --check .PHONY: lint-fix From 7bc5d940f202f92e929b9ad7cc04ea44cda7e049 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Mon, 30 Aug 2021 13:18:02 -0400 Subject: [PATCH 16/62] more cleanup --- Makefile | 2 +- dev-requirements.txt | 1 - docs/Makefile | 7 +- docs/make.bat | 5 +- docs/notebook_version_standardizer.py | 2 +- docs/source/user_guide/pipelines.ipynb | 2 +- evalml/__init__.py | 2 +- evalml/__main__.py | 6 +- evalml/model_understanding/graphs.py | 2 +- .../permutation_importance.py | 2 +- evalml/objectives/objective_base.py | 1 - evalml/pipelines/__init__.py | 1 + .../binary_classification_pipeline.py | 8 +-- .../binary_classification_pipeline_mixin.py | 5 ++ evalml/pipelines/classification_pipeline.py | 21 +++--- evalml/pipelines/component_graph.py | 30 ++++---- evalml/pipelines/components/__init__.py | 1 + evalml/pipelines/components/component_base.py | 23 +++--- .../components/component_base_meta.py | 1 + .../components/estimators/__init__.py | 1 + .../estimators/classifiers/__init__.py | 1 + .../classifiers/baseline_classifier.py | 26 +++++++ .../classifiers/catboost_classifier.py | 19 +++++ .../classifiers/decision_tree_classifier.py | 1 + .../classifiers/elasticnet_classifier.py | 11 +++ .../estimators/classifiers/et_classifier.py | 1 + .../classifiers/kneighbors_classifier.py | 1 + .../classifiers/lightgbm_classifier.py | 26 +++++++ 
.../logistic_regression_classifier.py | 2 + .../estimators/classifiers/rf_classifier.py | 1 + .../estimators/classifiers/svm_classifier.py | 3 +- .../classifiers/xgboost_classifier.py | 27 +++++++ .../components/estimators/estimator.py | 23 ++++-- .../estimators/regressors/__init__.py | 1 + .../estimators/regressors/arima_regressor.py | 19 +++++ .../regressors/baseline_regressor.py | 22 +++++- .../regressors/catboost_regressor.py | 11 +++ .../regressors/decision_tree_regressor.py | 1 + .../regressors/elasticnet_regressor.py | 2 + .../estimators/regressors/et_regressor.py | 1 + .../regressors/lightgbm_regressor.py | 19 ++++- .../estimators/regressors/linear_regressor.py | 2 + .../regressors/prophet_regressor.py | 32 ++++++--- .../estimators/regressors/rf_regressor.py | 1 + .../estimators/regressors/svm_regressor.py | 5 +- .../time_series_baseline_estimator.py | 28 ++++++++ .../regressors/xgboost_regressor.py | 19 +++++ .../transformers/imputers/__init__.py | 1 + .../transformers/imputers/imputer.py | 5 +- .../imputers/per_column_imputer.py | 5 +- .../transformers/imputers/simple_imputer.py | 9 +-- .../transformers/imputers/target_imputer.py | 8 +-- .../preprocessing/drop_null_columns.py | 10 +++ .../preprocessing/drop_rows_transformer.py | 19 +++++ .../preprocessing/featuretools.py | 2 +- .../preprocessing/log_transformer.py | 9 +++ .../preprocessing/polynomial_detrender.py | 2 +- .../preprocessing/text_featurizer.py | 3 +- .../transform_primitive_components.py | 1 + .../components/transformers/transformer.py | 8 +-- evalml/pipelines/components/utils.py | 54 +++++++------- evalml/pipelines/pipeline_base.py | 72 +++++++++++-------- evalml/pipelines/pipeline_meta.py | 1 + .../time_series_classification_pipelines.py | 9 +-- evalml/pipelines/time_series_pipeline_base.py | 2 +- .../time_series_regression_pipeline.py | 5 +- evalml/pipelines/utils.py | 25 +++---- 67 files changed, 505 insertions(+), 173 deletions(-) diff --git a/Makefile b/Makefile index 7ef23d5bf0..7cd1a6337d 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ clean: .PHONY: lint lint: flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions - pydocstyle evalml/ --convention=google --add-ignore=D107 --add-select=D400 --match-dir='[^\.|?!test].*' + pydocstyle evalml/ --convention=google --add-ignore=D107 --add-select=D400 --match-dir='^(?!(tests)).*' black evalml -t py39 --check .PHONY: lint-fix diff --git a/dev-requirements.txt b/dev-requirements.txt index fd52a4b605..78258aa136 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -4,5 +4,4 @@ flake8==3.7.0 black==21.5b1 isort==5.0.0 -docformatter==1.4 pydocstyle==6.1.1 diff --git a/docs/Makefile b/docs/Makefile index a8d23876a0..be36fab7e0 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -2,6 +2,7 @@ # # You can set these variables from the command line. +SPHINXOPTS = -W SPHINXBUILD = sphinx-build SOURCEDIR = source GENDIR = source/generated @@ -17,18 +18,18 @@ clean: .PHONY: html html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html -j 'auto' + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html $(SPHINXOPTS) -j 'auto' @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." # Put it first so that "make" without argument is like "make help". 
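The --match-dir fix in the Makefile above is worth a second look: the old value was a character class, which filters single characters rather than directory names, so it never actually excluded the tests directory. The new value is a negative lookahead. A quick behavioral check:

    import re

    pattern = re.compile(r"^(?!(tests)).*")
    assert pattern.match("evalml")      # ordinary package directories still match
    assert not pattern.match("tests")   # directories named tests are now skipped

Note that the lookahead also excludes any directory whose name merely starts with "tests".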
help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(O) + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(O) + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat index fa6005a60c..4d9eb83d9f 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -25,10 +25,11 @@ if errorlevel 9009 ( exit /b 1 ) -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% goto end :help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + :end popd diff --git a/docs/notebook_version_standardizer.py b/docs/notebook_version_standardizer.py index 48c51676b6..a8cbdf86c9 100644 --- a/docs/notebook_version_standardizer.py +++ b/docs/notebook_version_standardizer.py @@ -47,7 +47,7 @@ def _standardize_versions(notebooks, desired_version="3.8.6"): @click.group() def cli(): - """no-op.""" + """no-op""" @cli.command() diff --git a/docs/source/user_guide/pipelines.ipynb b/docs/source/user_guide/pipelines.ipynb index 3ec5ddb16b..af134fc1ab 100644 --- a/docs/source/user_guide/pipelines.ipynb +++ b/docs/source/user_guide/pipelines.ipynb @@ -151,7 +151,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Pipeline Args:\n", + "## Pipeline Parameters\n", "\n", "You can also pass in custom parameters by using the `parameters` parameter, which will then be used when instantiating each component in `component_graph`. The parameters dictionary needs to be in the format of a two-layered dictionary where the key-value pairs are the component name and corresponding component parameters dictionary. The component parameters dictionary consists of (parameter name, parameter values) key-value pairs.\n", "\n", diff --git a/evalml/__init__.py b/evalml/__init__.py index e821828fc8..f9c21be12e 100644 --- a/evalml/__init__.py +++ b/evalml/__init__.py @@ -1,4 +1,4 @@ -"""This is a docstring.""" +"""EvalML.""" import warnings # hack to prevent warnings from skopt diff --git a/evalml/__main__.py b/evalml/__main__.py index d92173eb11..404316ce7d 100644 --- a/evalml/__main__.py +++ b/evalml/__main__.py @@ -1,4 +1,4 @@ -"""I'm a docstring.""" +"""CLI commands.""" import click @@ -7,13 +7,13 @@ @click.group() def cli(): - """I'm a docstring.""" + """CLI command with no arguments. Does nothing.""" pass @click.command() def info(): - """I'm a docstring.""" + """CLI command with `info` argument. 
Prints info about the system, evalml, and dependencies of evalml.""" print_info() diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 120bbdeafb..c045b27aa2 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1,4 +1,4 @@ -"""I'm a docstring.""" +"""Model understanding graphing utilities.""" import copy import os diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py index bf492b05fe..519672f761 100644 --- a/evalml/model_understanding/permutation_importance.py +++ b/evalml/model_understanding/permutation_importance.py @@ -1,4 +1,4 @@ -"""I'm a docstring.""" +"""Permutation importance methods.""" import numpy as np import pandas as pd from joblib import Parallel, delayed diff --git a/evalml/objectives/objective_base.py b/evalml/objectives/objective_base.py index 1f0f9c05b6..e1f0347e36 100644 --- a/evalml/objectives/objective_base.py +++ b/evalml/objectives/objective_base.py @@ -166,7 +166,6 @@ def calculate_percent_difference(cls, score, baseline_score): as percentages, this will be the difference between the reference score and score. For all other objectives, the difference will be normalized by the reference score. """ - if pd.isna(score) or pd.isna(baseline_score): return np.nan diff --git a/evalml/pipelines/__init__.py b/evalml/pipelines/__init__.py index b1d611ede2..b03e71b880 100644 --- a/evalml/pipelines/__init__.py +++ b/evalml/pipelines/__init__.py @@ -1,3 +1,4 @@ +"""EvalML pipelines.""" from .components import ( Estimator, OneHotEncoder, diff --git a/evalml/pipelines/binary_classification_pipeline.py b/evalml/pipelines/binary_classification_pipeline.py index cc66e2409b..ac5848a0e9 100644 --- a/evalml/pipelines/binary_classification_pipeline.py +++ b/evalml/pipelines/binary_classification_pipeline.py @@ -1,3 +1,4 @@ +"""Pipeline subclass for all binary classification pipelines.""" from .binary_classification_pipeline_mixin import ( BinaryClassificationPipelineMixin, ) @@ -33,12 +34,11 @@ def _predict(self, X, objective=None): Args: X (pd.DataFrame): Data of shape [n_samples, n_features] - objective (Object or string): The objective to use to make predictions + objective (Object or string): The objective to use to make predictions. 
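Because _predict validates the objective against the problem type, a caller can hand predict either an objective name or an instance. A minimal sketch on the demo data, assuming a list of component names is an accepted component_graph in this version of the pipeline API:

    from evalml.demos import load_breast_cancer
    from evalml.pipelines import BinaryClassificationPipeline

    X, y = load_breast_cancer()
    pipeline = BinaryClassificationPipeline(["Imputer", "Random Forest Classifier"])
    pipeline.fit(X, y)
    pipeline.threshold = 0.6  # used instead of the default 0.5 probability cutoff
    preds = pipeline.predict(X, objective="F1")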
- Returns + Returns: pd.Series: Estimated labels """ - if objective is not None: objective = get_objective(objective, return_instance=True) if not objective.is_defined_for_problem_type(self.problem_type): @@ -58,7 +58,7 @@ def predict_proba(self, X): Args: X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features] - Returns + Returns: pd.Series: Probability estimates """ return super().predict_proba(X) diff --git a/evalml/pipelines/binary_classification_pipeline_mixin.py b/evalml/pipelines/binary_classification_pipeline_mixin.py index 6bb3b4a98c..494d1c0883 100644 --- a/evalml/pipelines/binary_classification_pipeline_mixin.py +++ b/evalml/pipelines/binary_classification_pipeline_mixin.py @@ -1,4 +1,9 @@ +"""Binary classification pipeline mix-in class.""" + + class BinaryClassificationPipelineMixin: + """Binary classification pipeline mix-in class.""" + _threshold = None @property diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py index f1db69d46c..1716bf6416 100644 --- a/evalml/pipelines/classification_pipeline.py +++ b/evalml/pipelines/classification_pipeline.py @@ -1,3 +1,4 @@ +"""Pipeline subclass for all classification pipelines.""" import pandas as pd from sklearn.preprocessing import LabelEncoder @@ -42,7 +43,7 @@ def fit(self, X, y): X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, np.ndarray): The target training labels of length [n_samples] - Returns + Returns: self """ X = infer_feature_types(X) @@ -81,10 +82,10 @@ def _predict(self, X, objective=None): """Make predictions using selected features. Args: - X (pd.DataFrame): Data of shape [n_samples, n_features] - objective (Object or string): The objective to use to make predictions + X (pd.DataFrame): Data of shape [n_samples, n_features]. + objective (Object or string): The objective to use to make predictions. - Returns + Returns: pd.Series: Estimated labels """ return self.component_graph.predict(X) @@ -93,11 +94,11 @@ def predict(self, X, objective=None): """Make predictions using selected features. Args: - X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features] - objective (Object or string): The objective to use to make predictions + X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. + objective (Object or string): The objective to use to make predictions. - Returns - pd.Series: Estimated labels + Returns: + pd.Series: Estimated labels. """ predictions = self._predict(X, objective=objective) predictions = pd.Series( @@ -111,7 +112,7 @@ def predict_proba(self, X): Args: X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features] - Returns + Returns: pd.DataFrame: Probability estimates """ if self.estimator is None: @@ -136,7 +137,7 @@ def score(self, X, y, objectives): y (pd.Series, or np.ndarray): True labels of length [n_samples] objectives (list): List of objectives to score - Returns + Returns: dict: Ordered dictionary of objective scores """ y = infer_feature_types(y) diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 9983502e45..bd98ba71c0 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -126,7 +126,7 @@ def compute_order(self): def default_parameters(self): """The default parameter dictionary for this pipeline. - Returns + Returns: dict: Dictionary of all component default parameters. 
""" defaults = {} @@ -193,7 +193,7 @@ def fit_features(self, X, y): X (pd.DataFrame): The input training data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. - Returns + Returns: pd.DataFrame: Transformed values. """ return self._fit_transform_features_helper(True, X, y) @@ -205,7 +205,7 @@ def compute_final_component_features(self, X, y=None): X (pd.DataFrame): Data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. Defaults to None. - Returns + Returns: pd.DataFrame: Transformed values. """ return self._fit_transform_features_helper(False, X, y) @@ -218,7 +218,7 @@ def _fit_transform_features_helper(self, needs_fitting, X, y=None): X (pd.DataFrame): Data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. Defaults to None. - Returns + Returns: pd.DataFrame: Transformed values. """ if len(self.compute_order) <= 1: @@ -264,7 +264,7 @@ def transform(self, X, y=None): X (pd.DataFrame): Input features of shape [n_samples, n_features]. y (pd.Series): The target data of length [n_samples]. Defaults to None. - Returns + Returns: pd.DataFrame: Transformed output. """ if len(self.compute_order) == 0: @@ -289,7 +289,7 @@ def predict(self, X): Args: X (pd.DataFrame): Input features of shape [n_samples, n_features]. - Returns + Returns: pd.Series: Predicted values. """ if len(self.compute_order) == 0: @@ -313,7 +313,7 @@ def _compute_features(self, component_list, X, y=None, fit=False): fit (boolean): Whether to fit the estimators as well as transform it. Defaults to False. - Returns + Returns: dict: Outputs from each component. """ X = infer_feature_types(X) @@ -390,8 +390,8 @@ def _get_feature_provenance(self, input_feature_names): Args: input_feature_names (list(str)): Names of the features in the input dataframe. - Returns - dictionary: mapping of feature name to set feature names that were created from that feature. + Returns: + dict: Dictionary mapping of feature name to set feature names that were created from that feature. """ if not self.compute_order: return {} @@ -445,7 +445,7 @@ def get_component(self, component_name): Args: component_name (str): Name of the component to retrieve - Returns + Returns: ComponentBase object """ try: @@ -456,7 +456,7 @@ def get_component(self, component_name): def get_last_component(self): """Retrieves the component that is computed last in the graph, usually the final estimator. - Returns + Returns: ComponentBase object """ if len(self.compute_order) == 0: @@ -467,7 +467,7 @@ def get_last_component(self): def get_estimators(self): """Gets a list of all the estimator components within this graph. - Returns + Returns: list: All estimator objects within the graph. """ if not isinstance(self.get_last_component(), ComponentBase): @@ -503,7 +503,7 @@ def describe(self, return_dict=False): Args: return_dict (bool): If True, return dictionary of information about component graph. Defaults to False. - Returns + Returns: dict: Dictionary of all component parameters if return_dict is True, else None """ components = {} @@ -527,7 +527,7 @@ def graph(self, name=None, graph_format=None): name (str): Name of the graph. Defaults to None. graph_format (str): file format to save the graph in. Defaults to None. - Returns + Returns: graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks. 
""" graphviz = import_or_raise( @@ -621,7 +621,7 @@ def __iter__(self): def __next__(self): """Iterator for graphs, retrieves the components in the graph in order. - Returns + Returns: ComponentBase obj: The next component class or instance in the graph """ if self._i < len(self.compute_order): diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py index d35cc65bbf..e5ec246a01 100644 --- a/evalml/pipelines/components/__init__.py +++ b/evalml/pipelines/components/__init__.py @@ -1,3 +1,4 @@ +"""EvalML component classes.""" from .component_base import ComponentBase, ComponentBaseMeta from .estimators import ( Estimator, diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index a59509b233..c282cac5a0 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -1,3 +1,4 @@ +"""Base class for all components.""" import copy from abc import ABC, abstractmethod @@ -94,10 +95,9 @@ def default_parameters(cls): Our convention is that Component.default_parameters == Component().parameters. - Returns - dict: default parameters for this component. + Returns: + dict: Default parameters for this component. """ - if cls._default_parameters is None: cls._default_parameters = cls().parameters @@ -110,7 +110,7 @@ def _supported_by_list_API(cls): def clone(self): """Constructs a new component with the same parameters and random state. - Returns + Returns: A new instance of this component with identical parameters and random state. """ return self.__class__(**self.parameters, random_seed=self.random_seed) @@ -119,10 +119,10 @@ def fit(self, X, y=None): """Fits component to data. Args: - X (list, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] - y (list, pd.Series, np.ndarray, optional): The target training data of length [n_samples] + X (pd.DataFrame): The input training data of shape [n_samples, n_features] + y (pd.Series, optional): The target training data of length [n_samples] - Returns + Returns: self """ X = infer_feature_types(X) @@ -143,8 +143,8 @@ def describe(self, print_name=False, return_dict=False): print_name(bool, optional): whether to print name of component return_dict(bool, optional): whether to return description as dictionary in the format {"name": name, "parameters": parameters} - Returns - None or dict: prints and returns dictionary + Returns: + None or dict: Returns dictionary if return_dict is True, else None. """ if print_name: title = self.name @@ -166,7 +166,7 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): file_path (str): Location to save file pickle_protocol (int): The pickle data stream format. 
- Returns + Returns: None """ with open(file_path, "wb") as f: @@ -186,6 +186,7 @@ def load(file_path): return cloudpickle.load(f) def __eq__(self, other): + """Check for equality.""" if not isinstance(other, self.__class__): return False random_seed_eq = self.random_seed == other.random_seed @@ -198,9 +199,11 @@ def __eq__(self, other): return True def __str__(self): + """String representation of a component.""" return self.name def __repr__(self): + """String representation of a component.""" parameters_repr = ", ".join( [f"{key}={safe_repr(value)}" for key, value in self.parameters.items()] ) diff --git a/evalml/pipelines/components/component_base_meta.py b/evalml/pipelines/components/component_base_meta.py index f9ea2551fc..af7c2bbce4 100644 --- a/evalml/pipelines/components/component_base_meta.py +++ b/evalml/pipelines/components/component_base_meta.py @@ -1,3 +1,4 @@ +"""Metaclass that overrides creating a new component by wrapping methods with validators and setters.""" from functools import wraps from evalml.exceptions import ComponentNotYetFittedError diff --git a/evalml/pipelines/components/estimators/__init__.py b/evalml/pipelines/components/estimators/__init__.py index c59c8c291d..1d7b41ae62 100644 --- a/evalml/pipelines/components/estimators/__init__.py +++ b/evalml/pipelines/components/estimators/__init__.py @@ -1,3 +1,4 @@ +"""EvalML estimator components.""" from .estimator import Estimator from .classifiers import ( LogisticRegressionClassifier, diff --git a/evalml/pipelines/components/estimators/classifiers/__init__.py b/evalml/pipelines/components/estimators/classifiers/__init__.py index 5c3baaf5ae..f978266685 100644 --- a/evalml/pipelines/components/estimators/classifiers/__init__.py +++ b/evalml/pipelines/components/estimators/classifiers/__init__.py @@ -1,3 +1,4 @@ +"""Classification model components.""" from .logistic_regression_classifier import LogisticRegressionClassifier from .rf_classifier import RandomForestClassifier from .xgboost_classifier import XGBoostClassifier diff --git a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py index 1aed5552b5..a18ec686d1 100644 --- a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py @@ -1,3 +1,4 @@ +"""Baseline classifier.""" import numpy as np import pandas as pd @@ -42,6 +43,15 @@ def __init__(self, strategy="mode", random_seed=0, **kwargs): ) def fit(self, X, y=None): + """Fits baseline classifier component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. + + Returns: + self + """ if y is None: raise ValueError("Cannot fit Baseline classifier if y is None") X = infer_feature_types(X) @@ -58,6 +68,14 @@ def fit(self, X, y=None): return self def predict(self, X): + """Make predictions using the baseline classification strategy. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.Series: Predicted values. + """ X = infer_feature_types(X) strategy = self.parameters["strategy"] if strategy == "mode": @@ -73,6 +91,14 @@ def predict(self, X): return infer_feature_types(predictions) def predict_proba(self, X): + """Make prediction probabilities using the baseline classification strategy. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. 
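The baseline strategies above are easiest to understand end to end. A minimal run with the "mode" strategy:

    import pandas as pd
    from evalml.pipelines.components import BaselineClassifier

    X = pd.DataFrame({"feature": [1, 2, 3, 4]})
    y = pd.Series([0, 0, 1, 0])
    clf = BaselineClassifier(strategy="mode").fit(X, y)
    clf.predict(X)  # every prediction is the mode of y, here 0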
+ + Returns: + pd.DataFrame: Predicted probability values. + """ X = infer_feature_types(X) strategy = self.parameters["strategy"] if strategy == "mode": diff --git a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py index f6dc0e0ae9..7d4260d5b3 100644 --- a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py @@ -1,3 +1,4 @@ +"""CatBoost Classifier, a classifier that uses gradient-boosting on decision trees. CatBoost is an open-source library and natively supports categorical features.""" import copy import warnings @@ -99,6 +100,15 @@ def __init__( ) def fit(self, X, y=None): + """Fits CatBoost classifier component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. + + Returns: + self + """ X = infer_feature_types(X) cat_cols = list(X.ww.select("category", return_schema=True).columns) self.input_feature_names = list(X.columns) @@ -111,6 +121,14 @@ def fit(self, X, y=None): return self def predict(self, X): + """Make predictions using the fitted CatBoost classifier. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.DataFrame: Predicted values. + """ X = infer_feature_types(X) predictions = self._component_obj.predict(X) if predictions.ndim == 2 and predictions.shape[1] == 1: @@ -123,4 +141,5 @@ def predict(self, X): @property def feature_importance(self): + """Feature importance of fitted CatBoost classifier.""" return self._component_obj.get_feature_importance() diff --git a/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py b/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py index 38994b77d1..5a3755afab 100644 --- a/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py @@ -1,3 +1,4 @@ +"""Decision Tree Classifier.""" from sklearn.tree import DecisionTreeClassifier as SKDecisionTreeClassifier from skopt.space import Integer diff --git a/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py b/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py index 99abb89f47..06fa98e725 100644 --- a/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py @@ -1,3 +1,4 @@ +"""Elastic Net Classifier. Uses Logistic Regression with elasticnet penalty as the base estimator.""" import warnings import numpy as np @@ -82,11 +83,21 @@ def __init__( ) def fit(self, X, y): + """Fits ElasticNet classifier component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. 
+ + Returns: + self + """ warnings.filterwarnings("ignore", message="The max_iter was reached") return super().fit(X, y) @property def feature_importance(self): + """Feature importance for fitted ElasticNet classifier.""" coef_ = self._component_obj.coef_ # binary classification case if len(coef_) <= 2: diff --git a/evalml/pipelines/components/estimators/classifiers/et_classifier.py b/evalml/pipelines/components/estimators/classifiers/et_classifier.py index c623051754..714ddf3a25 100644 --- a/evalml/pipelines/components/estimators/classifiers/et_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/et_classifier.py @@ -1,3 +1,4 @@ +"""Extra Trees Classifier.""" from sklearn.ensemble import ExtraTreesClassifier as SKExtraTreesClassifier from skopt.space import Integer diff --git a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py index 89e9321488..d6781e8514 100644 --- a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py @@ -1,3 +1,4 @@ +"""K-Nearest Neighbors Classifier.""" import numpy as np from sklearn.neighbors import KNeighborsClassifier as SKKNeighborsClassifier from skopt.space import Integer diff --git a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py index 6328e0e9bb..0ffb48a41b 100644 --- a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py @@ -1,3 +1,4 @@ +"""LightGBM Classifier.""" import copy import numpy as np @@ -171,6 +172,15 @@ def _encode_labels(self, y): return y_encoded def fit(self, X, y=None): + """Fits LightGBM classifier component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. + + Returns: + self + """ X = infer_feature_types(X) X_encoded = self._encode_categories(X, fit=True) y_encoded = self._encode_labels(y) @@ -178,6 +188,14 @@ def fit(self, X, y=None): return self def predict(self, X): + """Make predictions using the fitted LightGBM classifier. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.DataFrame: Predicted values. + """ X_encoded = self._encode_categories(X) predictions = super().predict(X_encoded) if not self._label_encoder: @@ -188,5 +206,13 @@ def predict(self, X): return infer_feature_types(predictions) def predict_proba(self, X): + """Make prediction probabilities using the fitted LightGBM classifier. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.DataFrame: Predicted probability values. 
+ """ X_encoded = self._encode_categories(X) return super().predict_proba(X_encoded) diff --git a/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py b/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py index 6bc45859a0..02c86c0d5b 100644 --- a/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/logistic_regression_classifier.py @@ -1,3 +1,4 @@ +"""Logistic Regression Classifier.""" import numpy as np from sklearn.linear_model import LogisticRegression as SKLogisticRegression from skopt.space import Real @@ -80,6 +81,7 @@ def __init__( @property def feature_importance(self): + """Feature importance for fitted logistic regression classifier.""" coef_ = self._component_obj.coef_ # binary classification case if len(coef_) <= 2: diff --git a/evalml/pipelines/components/estimators/classifiers/rf_classifier.py b/evalml/pipelines/components/estimators/classifiers/rf_classifier.py index 23ae50d2db..ee10927e7d 100644 --- a/evalml/pipelines/components/estimators/classifiers/rf_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/rf_classifier.py @@ -1,3 +1,4 @@ +"""Random Forest Classifier.""" from sklearn.ensemble import RandomForestClassifier as SKRandomForestClassifier from skopt.space import Integer diff --git a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py index c7fcfb0830..efc896dec6 100644 --- a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py @@ -1,3 +1,4 @@ +"""Support Vector Machine Classifier.""" import numpy as np from sklearn.svm import SVC from skopt.space import Real @@ -72,7 +73,7 @@ def __init__( def feature_importance(self): """Feature importance only works with linear kernels. - If the kernel isn't linear, we return a numpy array of zeros + If the kernel isn't linear, we return a numpy array of zeros. """ if self._parameters["kernel"] != "linear": return np.zeros(self._component_obj.n_features_in_) diff --git a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py index 20b20511cf..ed6b9f0c9a 100644 --- a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py @@ -1,3 +1,4 @@ +"""XGBoost Classifier.""" from skopt.space import Integer, Real from evalml.model_family import ModelFamily @@ -88,6 +89,15 @@ def _convert_bool_to_int(X): } def fit(self, X, y=None): + """Fits XGBoost classifier component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. + + Returns: + self + """ X, y = super()._manage_woodwork(X, y) X.ww.set_types(self._convert_bool_to_int(X)) self.input_feature_names = list(X.columns) @@ -96,12 +106,28 @@ def fit(self, X, y=None): return self def predict(self, X): + """Make predictions using the fitted XGBoost classifier. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.DataFrame: Predicted values. 
+ """ X, _ = super()._manage_woodwork(X) X.ww.set_types(self._convert_bool_to_int(X)) X = _rename_column_names_to_numeric(X, flatten_tuples=False) return super().predict(X) def predict_proba(self, X): + """Make predictions using the fitted CatBoost classifier. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.DataFrame: Predicted values. + """ X, _ = super()._manage_woodwork(X) X.ww.set_types(self._convert_bool_to_int(X)) X = _rename_column_names_to_numeric(X, flatten_tuples=False) @@ -109,4 +135,5 @@ def predict_proba(self, X): @property def feature_importance(self): + """Feature importance of fitted XGBoost classifier.""" return self._component_obj.feature_importances_ diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py index fb98f5c4dd..d1e377c21f 100644 --- a/evalml/pipelines/components/estimators/estimator.py +++ b/evalml/pipelines/components/estimators/estimator.py @@ -1,3 +1,4 @@ +"""A component that fits and predicts given data.""" from abc import abstractmethod from pandas.core.indexes import range @@ -17,7 +18,7 @@ class Estimator(ComponentBase): uses standard keyword arguments and calls `super().__init__()` with a parameters dict. You may also override the `fit`, `transform`, `fit_transform` and other methods in this class if appropriate. - To see some examples, check out the definitions of any Estimator component. + To see some examples, check out the definitions of any Estimator component subclass. Args: parameters (dict): Dictionary of parameters for the component. Defaults to None. @@ -57,6 +58,15 @@ def _manage_woodwork(self, X, y=None): return X, y def fit(self, X, y=None): + """Fits estimator to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + """ X, y = self._manage_woodwork(X, y) self.input_feature_names = list(X.columns) self._component_obj.fit(X, y) @@ -66,10 +76,10 @@ def predict(self, X): """Make predictions using selected features. Args: - X (pd.DataFrame, np.ndarray): Data of shape [n_samples, n_features] + X (pd.DataFrame): Data of shape [n_samples, n_features] - Returns - pd.Series: Predicted values + Returns: + pd.Series: Predicted values. """ try: X = infer_feature_types(X) @@ -88,8 +98,8 @@ def predict_proba(self, X): Args: X (pd.DataFrame, or np.ndarray): Features - Returns - pd.Series: Probability estimates + Returns: + pd.Series: Probability estimates. 
""" try: X = infer_feature_types(X) @@ -115,6 +125,7 @@ def feature_importance(self): ) def __eq__(self, other): + """Check for equality.""" return ( super().__eq__(other) and self.supported_problem_types == other.supported_problem_types diff --git a/evalml/pipelines/components/estimators/regressors/__init__.py b/evalml/pipelines/components/estimators/regressors/__init__.py index d298f4209c..32afcb83a1 100644 --- a/evalml/pipelines/components/estimators/regressors/__init__.py +++ b/evalml/pipelines/components/estimators/regressors/__init__.py @@ -1,3 +1,4 @@ +"""Regression model components.""" from .elasticnet_regressor import ElasticNetRegressor from .linear_regressor import LinearRegressor from .lightgbm_regressor import LightGBMRegressor diff --git a/evalml/pipelines/components/estimators/regressors/arima_regressor.py b/evalml/pipelines/components/estimators/regressors/arima_regressor.py index 750033084d..9f4956ba39 100644 --- a/evalml/pipelines/components/estimators/regressors/arima_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/arima_regressor.py @@ -1,3 +1,4 @@ +"""Autoregressive Integrated Moving Average Model. The three parameters (p, d, q) are the AR order, the degree of differencing, and the MA order. More information here: https://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.html.""" import numpy as np import pandas as pd from skopt.space import Integer @@ -157,6 +158,15 @@ def _format_dates(self, dates, X, y, predict=False): return X, y, None def fit(self, X, y=None): + """Fits ARIMA regressor to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. + + Returns: + self + """ if y is None: raise ValueError("ARIMA Regressor requires y as input.") @@ -171,6 +181,15 @@ def fit(self, X, y=None): return self def predict(self, X, y=None): + """Make predictions using fitted ARIMA regressor. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + y (pd.Series): Target data. + + Returns: + pd.Series: Predicted values. + """ X, y = self._manage_woodwork(X, y) dates, X = self._get_dates(X, y) X, y, fh_ = self._format_dates(dates, X, y, predict=True) diff --git a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py index 0d9bb03889..59e3dc8ee4 100644 --- a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py @@ -1,3 +1,4 @@ +"""Baseline regressor that uses a simple strategy to make predictions. This is useful as a simple baseline regressor to compare with other regressors.""" import numpy as np import pandas as pd @@ -44,6 +45,15 @@ def __init__(self, strategy="mean", random_seed=0, **kwargs): ) def fit(self, X, y=None): + """Fits baseline regression component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. + + Returns: + self + """ if y is None: raise ValueError("Cannot fit Baseline regressor if y is None") X = infer_feature_types(X) @@ -57,6 +67,14 @@ def fit(self, X, y=None): return self def predict(self, X): + """Make predictions using the baseline regression strategy. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.Series: Predicted values. 
+ """ X = infer_feature_types(X) predictions = pd.Series([self._prediction_value] * len(X)) return infer_feature_types(predictions) @@ -65,7 +83,7 @@ def predict(self, X): def feature_importance(self): """Returns importance associated with each feature. Since baseline regressors do not use input features to calculate predictions, returns an array of zeroes. - Returns - np.ndarray (float): An array of zeroes + Returns: + np.ndarray (float): An array of zeroes. """ return np.zeros(self._num_features) diff --git a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py index d795e7f115..26ff1ced86 100644 --- a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py @@ -1,3 +1,4 @@ +"""CatBoost Regressor, a regressor that uses gradient-boosting on decision trees. CatBoost is an open-source library and natively supports categorical features.""" import copy import warnings @@ -91,6 +92,15 @@ def __init__( ) def fit(self, X, y=None): + """Fits CatBoost regressor component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. + + Returns: + self + """ X = infer_feature_types(X) cat_cols = list(X.ww.select("category", return_schema=True).columns) self.input_feature_names = list(X.columns) @@ -100,4 +110,5 @@ def fit(self, X, y=None): @property def feature_importance(self): + """Feature importance of fitted CatBoost regressor.""" return self._component_obj.get_feature_importance() diff --git a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py index 4562bdb2f5..2c48b5e454 100644 --- a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py @@ -1,3 +1,4 @@ +"""Decision Tree Regressor.""" from sklearn.tree import DecisionTreeRegressor as SKDecisionTreeRegressor from skopt.space import Integer diff --git a/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py b/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py index b20a86e05d..59325fd6df 100644 --- a/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py @@ -1,3 +1,4 @@ +"""Elastic Net Regressor.""" from sklearn.linear_model import ElasticNet as SKElasticNet from skopt.space import Real @@ -62,4 +63,5 @@ def __init__( @property def feature_importance(self): + """Feature importance for fitted ElasticNet regressor.""" return self._component_obj.coef_ diff --git a/evalml/pipelines/components/estimators/regressors/et_regressor.py b/evalml/pipelines/components/estimators/regressors/et_regressor.py index fd8dac3609..515fa90939 100644 --- a/evalml/pipelines/components/estimators/regressors/et_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/et_regressor.py @@ -1,3 +1,4 @@ +"""Extra Trees Regressor.""" from sklearn.ensemble import ExtraTreesRegressor as SKExtraTreesRegressor from skopt.space import Integer diff --git a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py index c230e0d633..ac93b63cc2 100644 --- 
a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py @@ -1,3 +1,4 @@ +"""LightGBM Regressor.""" import copy import pandas as pd @@ -84,7 +85,6 @@ def __init__( random_seed=0, **kwargs, ): - parameters = { "boosting_type": boosting_type, "learning_rate": learning_rate, @@ -147,6 +147,15 @@ def _encode_categories(self, X, fit=False): return X_encoded def fit(self, X, y=None): + """Fits LightGBM regressor to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. + + Returns: + self + """ X_encoded = self._encode_categories(X, fit=True) if y is not None: y = infer_feature_types(y) @@ -154,5 +163,13 @@ def fit(self, X, y=None): return self def predict(self, X): + """Make predictions using fitted LightGBM regressor. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.Series: Predicted values. + """ X_encoded = self._encode_categories(X) return super().predict(X_encoded) diff --git a/evalml/pipelines/components/estimators/regressors/linear_regressor.py b/evalml/pipelines/components/estimators/regressors/linear_regressor.py index 2d2fa05ec4..e7d20a1fd4 100644 --- a/evalml/pipelines/components/estimators/regressors/linear_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/linear_regressor.py @@ -1,3 +1,4 @@ +"""Linear Regressor.""" from sklearn.linear_model import LinearRegression as SKLinearRegression from evalml.model_family import ModelFamily @@ -54,4 +55,5 @@ def __init__( @property def feature_importance(self): + """Feature importance for fitted linear regressor.""" return self._component_obj.coef_ diff --git a/evalml/pipelines/components/estimators/regressors/prophet_regressor.py b/evalml/pipelines/components/estimators/regressors/prophet_regressor.py index de82806392..e494cdf31c 100644 --- a/evalml/pipelines/components/estimators/regressors/prophet_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/prophet_regressor.py @@ -1,3 +1,4 @@ +"""Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.""" import copy import numpy as np @@ -46,7 +47,6 @@ def __init__( stan_backend="CMDSTANPY", **kwargs, ): - parameters = { "changepoint_prior_scale": changepoint_prior_scale, "seasonality_prior_scale": seasonality_prior_scale, @@ -77,6 +77,7 @@ def __init__( @staticmethod def build_prophet_df(X, y=None, date_column="ds"): + """Build the Prophet data to pass fit and predict on.""" if X is not None: X = copy.deepcopy(X) if y is not None: @@ -107,9 +108,17 @@ def build_prophet_df(X, y=None, date_column="ds"): return prophet_df def fit(self, X, y=None): + """Fits Prophet regressor component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. 
+
+        Returns:
+            self
+        """
         if X is None:
             X = pd.DataFrame()
-
         X, y = super()._manage_woodwork(X, y)
 
         prophet_df = ProphetRegressor.build_prophet_df(
@@ -120,9 +129,17 @@ def fit(self, X, y=None):
         return self
 
     def predict(self, X, y=None):
+        """Make predictions using fitted Prophet regressor.
+
+        Args:
+            X (pd.DataFrame): Data of shape [n_samples, n_features].
+            y (pd.Series): Target data.
+
+        Returns:
+            pd.Series: Predicted values.
+        """
         if X is None:
             X = pd.DataFrame()
-
         X = infer_feature_types(X)
 
         prophet_df = ProphetRegressor.build_prophet_df(
@@ -134,6 +151,7 @@ def predict(self, X, y=None):
         return y_pred
 
     def get_params(self):
+        """Get parameters for the Prophet regressor."""
         return self.__dict__["_parameters"]
 
     @property
@@ -145,12 +163,9 @@ def feature_importance(self):
     def default_parameters(cls):
         """Returns the default parameters for this component.
 
-        Our convention is that Component.default_parameters == Component().parameters.
-
-        Returns
-            dict: default parameters for this component.
+        Returns:
+            dict: Default parameters for this component.
         """
-
         parameters = {
             "changepoint_prior_scale": 0.05,
             "date_index": None,
@@ -159,5 +174,4 @@ def default_parameters(cls):
             "seasonality_mode": "additive",
             "stan_backend": "CMDSTANPY",
         }
-
         return parameters
diff --git a/evalml/pipelines/components/estimators/regressors/rf_regressor.py b/evalml/pipelines/components/estimators/regressors/rf_regressor.py
index 867c761115..41dd3c9ad4 100644
--- a/evalml/pipelines/components/estimators/regressors/rf_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/rf_regressor.py
@@ -1,3 +1,4 @@
+"""Random Forest Regressor."""
 from sklearn.ensemble import RandomForestRegressor as SKRandomForestRegressor
 from skopt.space import Integer
 
diff --git a/evalml/pipelines/components/estimators/regressors/svm_regressor.py b/evalml/pipelines/components/estimators/regressors/svm_regressor.py
index aaca1f1487..08a9211e0e 100644
--- a/evalml/pipelines/components/estimators/regressors/svm_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/svm_regressor.py
@@ -1,3 +1,4 @@
+"""Support Vector Machine Regressor."""
 import numpy as np
 from sklearn.svm import SVR
 from skopt.space import Real
@@ -54,9 +55,9 @@ def __init__(self, C=1.0, kernel="rbf", gamma="auto", random_seed=0, **kwargs):
 
     @property
     def feature_importance(self):
-        """Feature importance only works with linear kernels.
+        """Feature importance of fitted SVM regressor.
 
-        If the kernel isn't linear, we return a numpy array of zeros
+        Only works with linear kernels. If the kernel isn't linear, we return a numpy array of zeros.
         """
         if self._parameters["kernel"] != "linear":
             return np.zeros(self._component_obj.n_features_in_)
diff --git a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py
index 07fd120d3f..b7e3ffd833 100644
--- a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py
+++ b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py
@@ -1,3 +1,4 @@
+"""Time series estimator that predicts using the naive forecasting approach."""
 import numpy as np
 import pandas as pd
 
@@ -51,6 +52,15 @@ def __init__(self, gap=1, random_seed=0, **kwargs):
         )
 
     def fit(self, X, y=None):
+        """Fits time series baseline estimator to data.
+
+        Args:
+            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
+            y (pd.Series): The target training data of length [n_samples].
+ + Returns: + self + """ if X is None: X = pd.DataFrame() X = infer_feature_types(X) @@ -58,6 +68,15 @@ def fit(self, X, y=None): return self def predict(self, X, y=None): + """Make predictions using fitted time series baseline estimator. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + y (pd.Series): Target data. + + Returns: + pd.Series: Predicted values. + """ if y is None: raise ValueError( "Cannot predict Time Series Baseline Estimator if y is None" @@ -70,6 +89,15 @@ def predict(self, X, y=None): return infer_feature_types(y) def predict_proba(self, X, y=None): + """Make prediction probabilities using fitted time series baseline estimator. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + y (pd.Series): Target data. + + Returns: + pd.DataFrame: Predicted probability values. + """ if y is None: raise ValueError( "Cannot predict Time Series Baseline Estimator if y is None" diff --git a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py index 2fca4af0a9..28a67c3d69 100644 --- a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py @@ -1,3 +1,4 @@ +"""XGBoost Regressor.""" from skopt.space import Integer, Real from evalml.model_family import ModelFamily @@ -85,6 +86,15 @@ def _convert_bool_to_int(X): } def fit(self, X, y=None): + """Fits XGBoost regressor component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + """ X, y = super()._manage_woodwork(X, y) X.ww.set_types(self._convert_bool_to_int(X)) self.input_feature_names = list(X.columns) @@ -93,6 +103,14 @@ def fit(self, X, y=None): return self def predict(self, X): + """Make predictions using fitted XGBoost regressor. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.Series: Predicted values. 
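+
+        Example:
+            Illustrative sketch, assuming the optional ``xgboost`` dependency
+            is installed; the dataset here is made up::
+
+                import pandas as pd
+                from evalml.pipelines.components import XGBoostRegressor
+
+                X = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 3, 2, 1]})
+                y = pd.Series([1.5, 2.5, 3.5, 4.5])
+                preds = XGBoostRegressor().fit(X, y).predict(X)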
+ """ X, _ = super()._manage_woodwork(X) X.ww.set_types(self._convert_bool_to_int(X)) X = _rename_column_names_to_numeric(X, flatten_tuples=False) @@ -100,4 +118,5 @@ def predict(self, X): @property def feature_importance(self): + """Feature importance of fitted XGBoost regressor.""" return self._component_obj.feature_importances_ diff --git a/evalml/pipelines/components/transformers/imputers/__init__.py b/evalml/pipelines/components/transformers/imputers/__init__.py index 12e607d1cd..3257ca035f 100644 --- a/evalml/pipelines/components/transformers/imputers/__init__.py +++ b/evalml/pipelines/components/transformers/imputers/__init__.py @@ -1,3 +1,4 @@ +"""Components that impute missing values in the input data.""" from .per_column_imputer import PerColumnImputer from .simple_imputer import SimpleImputer from .imputer import Imputer diff --git a/evalml/pipelines/components/transformers/imputers/imputer.py b/evalml/pipelines/components/transformers/imputers/imputer.py index 4a74ba46cf..fc3541d35f 100644 --- a/evalml/pipelines/components/transformers/imputers/imputer.py +++ b/evalml/pipelines/components/transformers/imputers/imputer.py @@ -1,3 +1,4 @@ +"""Component that imputes missing data according to a specified imputation strategy.""" import pandas as pd from evalml.pipelines.components.transformers import Transformer @@ -82,7 +83,7 @@ def fit(self, X, y=None): X (pd.DataFrame, np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, optional): The target training data of length [n_samples] - Returns + Returns: self """ X = infer_feature_types(X) @@ -112,7 +113,7 @@ def transform(self, X, y=None): X (pd.DataFrame): Data to transform y (pd.Series, optional): Ignored. - Returns + Returns: pd.DataFrame: Transformed X """ X = infer_feature_types(X) diff --git a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py index 7beff86db1..5c708a1f60 100644 --- a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py @@ -1,3 +1,4 @@ +"""Component that imputes missing data according to a specified imputation strategy per column.""" from evalml.pipelines.components.transformers import Transformer from evalml.pipelines.components.transformers.imputers.simple_imputer import ( SimpleImputer, @@ -58,7 +59,7 @@ def fit(self, X, y=None): X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit. y (pd.Series, optional): The target training data of length [n_samples]. Ignored. - Returns + Returns: self """ X = infer_feature_types(X) @@ -85,7 +86,7 @@ def transform(self, X, y=None): X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to transform. y (pd.Series, optional): The target training data of length [n_samples]. Ignored. 
- Returns + Returns: pd.DataFrame: Transformed X """ X_ww = infer_feature_types(X) diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py index 989195ec81..c9a7ec559c 100644 --- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py @@ -1,3 +1,4 @@ +"""Component that imputes missing data according to a specified imputation strategy.""" import pandas as pd from sklearn.impute import SimpleImputer as SkImputer from woodwork.logical_types import NaturalLanguage @@ -44,7 +45,7 @@ def fit(self, X, y=None): X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] y (pd.Series, optional): the target training data of length [n_samples] - Returns + Returns: self """ X = infer_feature_types(X) @@ -72,10 +73,10 @@ def transform(self, X, y=None): """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same. Args: - X (pd.DataFrame): Data to transform + X (pd.DataFrame): Data to transform. y (pd.Series, optional): Ignored. - Returns + Returns: pd.DataFrame: Transformed X """ X = infer_feature_types(X) @@ -111,7 +112,7 @@ def fit_transform(self, X, y=None): X (pd.DataFrame): Data to fit and transform y (pd.Series, optional): Target data. - Returns + Returns: pd.DataFrame: Transformed X """ return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/transformers/imputers/target_imputer.py b/evalml/pipelines/components/transformers/imputers/target_imputer.py index b64ad3f065..2875086d2c 100644 --- a/evalml/pipelines/components/transformers/imputers/target_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/target_imputer.py @@ -1,3 +1,4 @@ +"""Component that imputes missing target data according to a specified imputation strategy.""" from functools import wraps import pandas as pd @@ -72,7 +73,7 @@ def fit(self, X, y): X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored. y (pd.Series, optional): The target training data of length [n_samples]. - Returns + Returns: self """ if y is None: @@ -96,10 +97,9 @@ def transform(self, X, y): X (pd.DataFrame): Features. Ignored. y (pd.Series): Target data to impute. - Returns + Returns: (pd.DataFrame, pd.Series): The original X, transformed y """ - if X is not None: X = infer_feature_types(X) if y is None: @@ -124,7 +124,7 @@ def fit_transform(self, X, y): X (pd.DataFrame): Features. Ignored. y (pd.Series): Target data to impute. - Returns + Returns: (pd.DataFrame, pd.Series): The original X, transformed y """ return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py index 252cedde3d..ac441e453c 100644 --- a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py +++ b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py @@ -1,3 +1,4 @@ +"""Transformer to drop features whose percentage of NaN values exceeds a specified threshold.""" from evalml.pipelines.components.transformers import Transformer from evalml.utils import infer_feature_types @@ -30,6 +31,15 @@ def __init__(self, pct_null_threshold=1.0, random_seed=0, **kwargs): ) def fit(self, X, y=None): + """Fits component to data. 
+ + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + """ pct_null_threshold = self.parameters["pct_null_threshold"] X_t = infer_feature_types(X) percent_null = X_t.isnull().mean() diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py b/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py index 7f98f4542e..b7cf01f0f0 100644 --- a/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py @@ -1,3 +1,4 @@ +"""Transformer to drop rows specified by row indices.""" from evalml.pipelines.components.transformers import Transformer from evalml.utils import infer_feature_types @@ -24,6 +25,15 @@ def __init__(self, indices_to_drop=None, random_seed=0): super().__init__(parameters=None, component_obj=None, random_seed=random_seed) def fit(self, X, y=None): + """Fits component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + """ X_t = infer_feature_types(X) y_t = infer_feature_types(y) if y is not None else None if self.indices_to_drop is not None: @@ -49,6 +59,15 @@ def fit(self, X, y=None): return self def transform(self, X, y=None): + """Transforms data using fitted component. + + Args: + X (pd.DataFrame): Features. + y (pd.Series, optional): Target data. + + Returns: + (pd.DataFrame, pd.Series): Data with row indices dropped. + """ X_t = infer_feature_types(X) y_t = infer_feature_types(y) if y is not None else None if self.indices_to_drop is None or len(self.indices_to_drop) == 0: diff --git a/evalml/pipelines/components/transformers/preprocessing/featuretools.py b/evalml/pipelines/components/transformers/preprocessing/featuretools.py index 6eb0d7ddb0..6462cc99f5 100644 --- a/evalml/pipelines/components/transformers/preprocessing/featuretools.py +++ b/evalml/pipelines/components/transformers/preprocessing/featuretools.py @@ -48,7 +48,7 @@ def fit(self, X, y=None): Args: X (pd.DataFrame, np.array): The input data to transform, of shape [n_samples, n_features] - y (pd.Series, np.ndarray, optional): The target training data of length [n_samples] + y (pd.Series): The target training data of length [n_samples] Returns self diff --git a/evalml/pipelines/components/transformers/preprocessing/log_transformer.py b/evalml/pipelines/components/transformers/preprocessing/log_transformer.py index 35523ab29e..c47154402b 100644 --- a/evalml/pipelines/components/transformers/preprocessing/log_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/log_transformer.py @@ -63,6 +63,15 @@ def fit_transform(self, X, y=None): return self.fit(X, y).transform(X, y) def inverse_transform(self, y): + """Apply exponential to target data. + + Args: + y (pd.Series): Target variable. + + Returns: + pd.Series: Target with exponential applied. 
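+
+        Example:
+            Illustrative round trip with a made-up target, assuming transform
+            returns the features along with the log-transformed target as
+            EvalML's target transformers do::
+
+                import pandas as pd
+                from evalml.pipelines.components import LogTransformer
+
+                X = pd.DataFrame({"a": [1, 2, 3]})
+                y = pd.Series([1.0, 10.0, 100.0])
+                lt = LogTransformer()
+                X_t, y_log = lt.fit_transform(X, y)
+                y_back = lt.inverse_transform(y_log)  # approximately equal to y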
+ + """ y_ww_inv = infer_feature_types(y) y_inv = y_ww_inv.apply(np.exp) if self.min <= 0: diff --git a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py index 91dae0769f..63906153bb 100644 --- a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py +++ b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py @@ -1,3 +1,4 @@ +"""Component that removes trends from time series by fitting a polynomial to the data.""" import pandas as pd from skopt.space import Integer @@ -97,7 +98,6 @@ def inverse_transform(self, y): """Adds back fitted trend to target variable. Args: - X (pd.DataFrame, optional): Ignored. y (pd.Series): Target variable. Returns: diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 7ad27453c6..eded0e2bee 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -1,3 +1,4 @@ +"""Transformer that can automatically featurize text columns using featuretools' nlp_primitives.""" import string import featuretools as ft @@ -77,7 +78,7 @@ def fit(self, X, y=None): Args: X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] - y (pd.Series, np.ndarray, optional): The target training data of length [n_samples] + y (pd.Series): The target training data of length [n_samples] Returns: self diff --git a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py index 537747afe2..7d3b5f73a9 100644 --- a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py +++ b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py @@ -1,3 +1,4 @@ +"""Components that extract features from the input data.""" from abc import abstractmethod import featuretools as ft diff --git a/evalml/pipelines/components/transformers/transformer.py b/evalml/pipelines/components/transformers/transformer.py index 4d1ec69103..53c7179ba6 100644 --- a/evalml/pipelines/components/transformers/transformer.py +++ b/evalml/pipelines/components/transformers/transformer.py @@ -40,7 +40,7 @@ def transform(self, X, y=None): X (pd.DataFrame): Data to transform. y (pd.Series, optional): Target data. - Returns + Returns: pd.DataFrame: Transformed X """ X_ww = infer_feature_types(X) @@ -64,7 +64,7 @@ def fit_transform(self, X, y=None): X (pd.DataFrame): Data to fit and transform y (pd.Series): Target data - Returns + Returns: pd.DataFrame: Transformed X """ X_ww = infer_feature_types(X) @@ -98,6 +98,6 @@ def inverse_transform(self, y): Args: y (pd.Series): Target transformed by this component. - Returns - pd.Seriesø: Target without the transformation. + Returns: + pd.Series: Target without the transformation. 
""" diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index 6aafb5b261..02e5bc7704 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -1,3 +1,4 @@ +"""Utility methods for EvalML components.""" import inspect from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin @@ -28,6 +29,7 @@ def _all_transformers(): def all_components(): + """Get all available components.""" return _all_estimators() + _all_transformers() @@ -35,12 +37,11 @@ def allowed_model_families(problem_type): """List the model types allowed for a particular problem type. Args: - problem_types (ProblemTypes or str): binary, multiclass, or regression + problem_types (ProblemTypes or str): ProblemTypes enum or string. - Returns - list[ModelFamily]: a list of model families + Returns: + list[ModelFamily]: A list of model families """ - estimators = [] problem_type = handle_problem_types(problem_type) for estimator in _all_estimators_used_in_search(): @@ -59,11 +60,11 @@ def get_estimators(problem_type, model_families=None): Can also optionally filter by a list of model types. Args: - problem_type (ProblemTypes or str): problem type to filter for - model_families (list[ModelFamily] or list[str]): model families to filter for + problem_type (ProblemTypes or str): Problem type to filter for. + model_families (list[ModelFamily] or list[str]): Model families to filter for. - Returns - list[class]: a list of estimator subclasses + Returns: + list[class]: A list of estimator subclasses. """ if model_families is not None and not isinstance(model_families, list): raise TypeError("model_families parameter is not a list.") @@ -103,9 +104,9 @@ def handle_component_class(component_class): will return that without modification. Args: - component (str, ComponentBase): input to be standardized + component (str, ComponentBase): Input to be standardized. - Returns + Returns: ComponentBase """ if isinstance(component_class, ComponentBase) or ( @@ -134,7 +135,7 @@ def __init__(self, pipeline): """Scikit-learn classifier wrapper class. Takes an EvalML pipeline as input and returns a scikit-learn classifier class wrapping that pipeline. Args: - pipeline (PipelineBase or subclass obj): EvalML pipeline + pipeline (PipelineBase or subclass obj): EvalML pipeline. """ self.pipeline = pipeline self._estimator_type = "classifier" @@ -147,10 +148,10 @@ def fit(self, X, y): """Fits component to data. Args: - X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] - y (pd.Series, optional): the target training data of length [n_samples] + X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. - Returns + Returns: self """ self.classes_ = unique_labels(y) @@ -177,10 +178,10 @@ def predict_proba(self, X): """Make probability estimates for labels. Args: - X (pd.DataFrame): Features + X (pd.DataFrame): Features. - Returns - np.ndarray: Probability estimates + Returns: + np.ndarray: Probability estimates. """ return self.pipeline.predict_proba(X).to_numpy() @@ -192,7 +193,7 @@ def __init__(self, pipeline): """Scikit-learn regressor wrapper class. Takes an EvalML pipeline as input and returns a scikit-learn regressor class wrapping that pipeline. Args: - pipeline (PipelineBase or subclass obj): EvalML pipeline + pipeline (PipelineBase or subclass obj): EvalML pipeline. 
""" self.pipeline = pipeline self._estimator_type = "regressor" @@ -207,7 +208,7 @@ def fit(self, X, y): X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] y (pd.Series, optional): the target training data of length [n_samples] - Returns + Returns: self """ self.pipeline.fit(X, y) @@ -217,15 +218,16 @@ def predict(self, X): """Make predictions using selected features. Args: - X (pd.DataFrame): Features + X (pd.DataFrame): Features. - Returns - np.ndarray: Predicted values + Returns: + np.ndarray: Predicted values. """ return self.pipeline.predict(X).to_numpy() def scikit_learn_wrapped_estimator(evalml_obj): + """Wraps an EvalML object as a scikit-learn estimator.""" from evalml.pipelines.pipeline_base import PipelineBase """Wrap an EvalML pipeline or estimator in a scikit-learn estimator.""" @@ -261,9 +263,9 @@ def generate_component_code(element): """Creates and returns a string that contains the Python imports and code required for running the EvalML component. Args: - element (component instance): The instance of the component to generate string Python code for + element (component instance): The instance of the component to generate string Python code for. - Returns + Returns: String representation of Python code that can be run separately in order to recreate the component instance. Does not include code for custom component implementation. """ @@ -299,8 +301,8 @@ def make_balancing_dictionary(y, sampling_ratio): y (pd.Series): Target data sampling_ratio (float): The balanced ratio we want the samples to meet - Returns - Dictionary where keys are the classes, and the corresponding values are the counts of samples + Returns: + dict : Dictionary where keys are the classes, and the corresponding values are the counts of samples for each class that will satisfy sampling_ratio. """ if sampling_ratio <= 0 or sampling_ratio > 1: diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 0c977e145b..dd0400abf9 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -1,3 +1,4 @@ +"""Base machine learning pipeline class.""" import copy import inspect import os @@ -38,7 +39,7 @@ class PipelineBase(ABC, metaclass=PipelineBaseMeta): - """Machine learning pipeline made out of transformers and an Estimator. + """Machine learning pipeline. Args: component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. @@ -130,7 +131,8 @@ def name(self): def summary(self): """A short summary of the pipeline structure, describing the list of components used. 
- Example: Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder + Example: + Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder """ component_graph = [ type(self.component_graph.component_instances[component]) @@ -180,7 +182,7 @@ def _make_component_dict_from_component_list(component_list): return component_dict def _validate_estimator_problem_type(self): - """Validates this pipeline's problem_type against that of the estimator from `self.component_graph`""" + """Validates this pipeline's problem_type against that of the estimator from `self.component_graph`.""" if ( self.estimator is None ): # Allow for pipelines that do not end with an estimator @@ -194,20 +196,22 @@ def _validate_estimator_problem_type(self): ) def __getitem__(self, index): + """Get an element in the component graph.""" if isinstance(index, slice): raise NotImplementedError("Slicing pipelines is currently not supported.") return self.component_graph[index] def __setitem__(self, index, value): + """Set an element in the component graph.""" raise NotImplementedError("Setting pipeline components is not supported.") def get_component(self, name): """Returns component by name. Args: - name (str): Name of component + name (str): Name of component. - Returns + Returns: Component: Component to return """ return self.component_graph.get_component(name) @@ -218,8 +222,8 @@ def describe(self, return_dict=False): Args: return_dict (bool): If True, return dictionary of information about pipeline. Defaults to False. - Returns - dict: Dictionary of all component parameters if return_dict is True, else None + Returns: + dict: Dictionary of all component parameters if return_dict is True, else None. """ log_title(logger, self.name) logger.info("Problem Type: {}".format(self.problem_type)) @@ -252,7 +256,7 @@ def compute_estimator_features(self, X, y=None): Args: X (pd.DataFrame): Input data to the pipeline to transform. - Returns + Returns: pd.DataFrame: New transformed features. """ return self.component_graph.compute_final_component_features(X, y=y) @@ -270,7 +274,7 @@ def fit(self, X, y): X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. y (pd.Series, np.ndarray): The target training data of length [n_samples]. - Returns + Returns: self """ @@ -281,7 +285,7 @@ def transform(self, X, y=None): X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. y (pd.Series): The target data of length [n_samples]. Defaults to None. - Returns + Returns: pd.DataFrame: Transformed output. """ return self.component_graph.transform(X, y) @@ -293,7 +297,7 @@ def predict(self, X, objective=None): X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. objective (Object or string): The objective to use to make predictions. - Returns + Returns: pd.Series: Predicted values. """ X = infer_feature_types(X) @@ -310,7 +314,7 @@ def score(self, X, y, objectives): y (pd.Series, np.ndarray): True labels of length [n_samples]. objectives (list): Non-empty list of objectives to score on. - Returns + Returns: dict: Ordered dictionary of objective scores. """ @@ -331,7 +335,7 @@ def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives): Will be a DataFrame for multiclass problems and Series otherwise. Will be None for regression problems. objectives (list): List of objectives to score. - Returns + Returns: dict: Ordered dictionary with objectives and their scores. 
""" scored_successfully = OrderedDict() @@ -381,7 +385,7 @@ def model_family(self): def parameters(self): """Parameter dictionary for this pipeline. - Returns + Returns: dict: Dictionary of all component parameters. """ components = [ @@ -399,8 +403,8 @@ def parameters(self): def feature_importance(self): """Importance associated with each feature. Features dropped by the feature selection are excluded. - Returns - pd.DataFrame including feature names and their corresponding importance + Returns: + pd.DataFrame : Feature names and their corresponding importance """ feature_names = self.input_feature_names[self._estimator_name] importance = list( @@ -416,7 +420,7 @@ def graph(self, filepath=None): Args: filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph will not be saved. - Returns + Returns: graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks. """ graphviz = import_or_raise( @@ -471,8 +475,8 @@ def graph_feature_importance(self, importance_threshold=0): Args: importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is larger than importance_threshold. Defaults to zero. - Returns - plotly.Figure, a bar graph showing features and their corresponding importance + Returns: + plotly.Figure : A bar graph showing features and their corresponding importance """ go = import_or_raise( "plotly.graph_objects", @@ -516,10 +520,10 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): """Saves pipeline at file path. Args: - file_path (str): location to save file - pickle_protocol (int): the pickle data stream format. + file_path (str): Location to save file. + pickle_protocol (int): The pickle data stream format. - Returns + Returns: None """ with open(file_path, "wb") as f: @@ -530,9 +534,9 @@ def load(file_path): """Loads pipeline at file path. Args: - file_path (str): location to load file + file_path (str): Location to load file. - Returns + Returns: PipelineBase object """ with open(file_path, "rb") as f: @@ -541,7 +545,7 @@ def load(file_path): def clone(self): """Constructs a new pipeline with the same components, parameters, and random state. - Returns + Returns: A new instance of this pipeline with identical components, parameters, and random state. """ return self.__class__( @@ -558,7 +562,8 @@ def new(self, parameters, random_seed=0): parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. An empty dictionary or None implies using all default values for component parameters. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. - Returns + + Returns: A new instance of this pipeline with identical components. 
""" return self.__class__( @@ -569,6 +574,7 @@ def new(self, parameters, random_seed=0): ) def __eq__(self, other): + """Check for equality.""" if not isinstance(other, self.__class__): return False random_seed_eq = self.random_seed == other.random_seed @@ -587,9 +593,12 @@ def __eq__(self, other): return True def __str__(self): + """String representation of the component graph.""" return self.name def __repr__(self): + """String representation of the component graph.""" + def repr_component(parameters): return ", ".join( [f"'{key}': {safe_repr(value)}" for key, value in parameters.items()] @@ -623,9 +632,11 @@ def repr_component(parameters): return f"pipeline = {(type(self).__name__)}(component_graph={component_dict_str}, {additional_args_str})" def __iter__(self): + """Iterator for the component graph.""" return self def __next__(self): + """Get the next element in the component graph.""" return next(self.component_graph) def _get_feature_provenance(self): @@ -663,6 +674,7 @@ def _supports_fast_permutation_importance(self): @staticmethod def create_objectives(objectives): + """Create objective instances from a list of strings or objective classes.""" objective_instances = [] for objective in objectives: try: @@ -681,7 +693,7 @@ def can_tune_threshold_with_objective(self, objective): pipeline (PipelineBase): Binary classification pipeline. objective (ObjectiveBase): Primary AutoMLSearch objective. - Returns + Returns: bool: True if the pipeline threshold can be tuned. """ return ( @@ -693,10 +705,10 @@ def can_tune_threshold_with_objective(self, objective): def inverse_transform(self, y): """Apply component inverse_transform methods to estimator predictions in reverse order. - Components that implement inverse_transform are PolynomialDetrender, LabelEncoder (tbd). + Components that implement inverse_transform are PolynomialDetrender, LogTransformer, LabelEncoder (tbd). Args: - y (pd.Series): Final component features + y (pd.Series): Final component features. """ return self.component_graph.inverse_transform(y) @@ -706,7 +718,7 @@ def get_hyperparameter_ranges(self, custom_hyperparameters): Args: custom_hyperparameters (dict): Custom hyperparameters for the pipeline. - Returns + Returns: dict: Dictionary of hyperparameter ranges for each component in the pipeline. 
""" hyperparameter_ranges = dict() diff --git a/evalml/pipelines/pipeline_meta.py b/evalml/pipelines/pipeline_meta.py index 02710fc2cb..e01f740337 100644 --- a/evalml/pipelines/pipeline_meta.py +++ b/evalml/pipelines/pipeline_meta.py @@ -1,3 +1,4 @@ +"""Metaclass that overrides creating a new pipeline by wrapping methods with validators and setters.""" from functools import wraps from evalml.exceptions import PipelineNotYetFittedError diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py index 81fb3088fe..26988e1f08 100644 --- a/evalml/pipelines/time_series_classification_pipelines.py +++ b/evalml/pipelines/time_series_classification_pipelines.py @@ -1,3 +1,4 @@ +"""Pipeline base class for time-series classification problems.""" import pandas as pd from .binary_classification_pipeline_mixin import ( @@ -38,7 +39,7 @@ def fit(self, X, y): X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, np.ndarray): The target training targets of length [n_samples] - Returns + Returns: self """ X, y = self._convert_to_woodwork(X, y) @@ -76,7 +77,7 @@ def predict(self, X, y=None, objective=None): y (pd.Series, np.ndarray, None): The target training targets of length [n_samples]. objective (Object or string): The objective to use to make predictions. - Returns + Returns: pd.Series: Predicted values. """ if self.estimator is None: @@ -101,7 +102,7 @@ def predict_proba(self, X, y=None): Args: X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]. - Returns + Returns: pd.DataFrame: Probability estimates. """ if self.estimator is None: @@ -125,7 +126,7 @@ def score(self, X, y, objectives): y (pd.Series): True labels of length [n_samples]. objectives (list): Non-empty list of objectives to score on. - Returns + Returns: dict: Ordered dictionary of objective scores. """ X, y = self._convert_to_woodwork(X, y) diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py index c2078c53d0..2c549ba015 100644 --- a/evalml/pipelines/time_series_pipeline_base.py +++ b/evalml/pipelines/time_series_pipeline_base.py @@ -1,3 +1,4 @@ +"""Pipeline base class for time-series problems.""" import pandas as pd from evalml.pipelines import PipelineBase @@ -6,7 +7,6 @@ class TimeSeriesPipelineBase(PipelineBase, metaclass=PipelineBaseMeta): - """Pipeline base class for time series problems. Args: diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py index 1dd80461df..81773ca271 100644 --- a/evalml/pipelines/time_series_regression_pipeline.py +++ b/evalml/pipelines/time_series_regression_pipeline.py @@ -1,3 +1,4 @@ +"""Pipeline base class for time series regression problems.""" from evalml.pipelines.time_series_pipeline_base import TimeSeriesPipelineBase from evalml.problem_types import ProblemTypes from evalml.utils import ( @@ -34,7 +35,7 @@ def predict(self, X, y=None, objective=None): y (pd.Series, np.ndarray, None): The target training targets of length [n_samples]. objective (Object or string): The objective to use to make predictions. - Returns + Returns: pd.Series: Predicted values. """ if self.estimator is None: @@ -61,7 +62,7 @@ def score(self, X, y, objectives): y (pd.Series): True labels of length [n_samples]. objectives (list): Non-empty list of objectives to score on. - Returns + Returns: dict: Ordered dictionary of objective scores. 
""" X, y = self._convert_to_woodwork(X, y) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 325d2c9bbb..9022b1ab4f 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -1,3 +1,4 @@ +"""Utility methods for EvalML pipelines.""" from woodwork import logical_types from .binary_classification_pipeline import BinaryClassificationPipeline @@ -56,21 +57,18 @@ def _get_preprocessing_components( X, y, problem_type, estimator_class, sampler_name=None ): - """ - Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data. + """Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data. Args: - X (pd.DataFrame): The input data of shape [n_samples, n_features]. y (pd.Series): The target data of length [n_samples]. problem_type (ProblemTypes or str): Problem type. estimator_class (class): A class which subclasses Estimator estimator for pipeline. sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None. - Returns + Returns: list[Transformer]: A list of applicable preprocessing components to use with the estimator. """ - pp_components = [] if is_regression(problem_type): @@ -189,11 +187,9 @@ def make_pipeline( sampler_name=None, extra_components=None, ): - """ - Given input data, target data, an estimator class and the problem type, generates a pipeline class with a preprocessing chain which was recommended based on the inputs. The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. + """Given input data, target data, an estimator class and the problem type, generates a pipeline class with a preprocessing chain which was recommended based on the inputs. The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. Args: - X (pd.DataFrame): The input data of shape [n_samples, n_features]. y (pd.Series): The target data of length [n_samples]. estimator (Estimator): Estimator for pipeline. @@ -204,8 +200,7 @@ def make_pipeline( Defaults to None extra_components (list(ComponentBase)): List of extra components to be added after preprocessing components. Defaults to None. - Returns - + Returns: PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator. """ X = infer_feature_types(X) @@ -239,8 +234,8 @@ def generate_pipeline_code(element): Args: element (pipeline instance): The instance of the pipeline to generate string Python code - Returns - String representation of Python code that can be run separately in order to recreate the pipeline instance. + Returns: + str: String representation of Python code that can be run separately in order to recreate the pipeline instance. Does not include code for custom component implementation. """ # hold the imports needed and add code to end @@ -273,7 +268,7 @@ def _make_stacked_ensemble_pipeline( None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Defaults to -1. - Returns + Returns: Pipeline with appropriate stacked ensemble estimator. 
""" parameters = {} @@ -323,8 +318,8 @@ def _make_component_list_from_actions(actions): Args: actions (list(DataCheckAction)): List of DataCheckAction objects used to create list of components - Returns - List of components used to address the input actions + Returns: + list(ComponentBase): List of components used to address the input actions """ components = [] cols_to_drop = [] From 704918383dd1af0e53498e9b7aa8a80043c44c26 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Mon, 30 Aug 2021 16:16:01 -0400 Subject: [PATCH 17/62] adding more fixes --- evalml/pipelines/component_graph.py | 2 +- evalml/pipelines/components/ensemble/__init__.py | 2 +- .../ensemble/sklearn_stacked_ensemble_base.py | 1 + .../transformers/dimensionality_reduction/__init__.py | 1 + .../transformers/dimensionality_reduction/pca.py | 1 + .../components/transformers/encoders/__init__.py | 1 + .../components/transformers/encoders/onehot_encoder.py | 1 + .../components/transformers/preprocessing/__init__.py | 1 + .../preprocessing/delayed_feature_transformer.py | 1 + .../transformers/preprocessing/featuretools.py | 1 + .../transformers/preprocessing/log_transformer.py | 1 + .../components/transformers/preprocessing/lsa.py | 8 ++++++++ .../transformers/preprocessing/text_transformer.py | 1 + .../components/transformers/samplers/base_sampler.py | 1 + .../components/transformers/samplers/oversampler.py | 4 ++-- .../components/transformers/samplers/undersampler.py | 10 ++++++++++ 16 files changed, 33 insertions(+), 4 deletions(-) diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index bd98ba71c0..e693533c32 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -683,7 +683,7 @@ def inverse_transform(self, y): Components that implement inverse_transform are PolynomialDetrender, LabelEncoder (tbd). Args: - y: (pd.Series): Final component features + y: (pd.Series): Final component features. 
""" data_to_transform = infer_feature_types(y) current_component = self.compute_order[-1] diff --git a/evalml/pipelines/components/ensemble/__init__.py b/evalml/pipelines/components/ensemble/__init__.py index 7a81cc653b..113ae623be 100644 --- a/evalml/pipelines/components/ensemble/__init__.py +++ b/evalml/pipelines/components/ensemble/__init__.py @@ -1,4 +1,4 @@ -# flake8:noqa +"""Ensemble components.""" from .sklearn_stacked_ensemble_base import SklearnStackedEnsembleBase from .sklearn_stacked_ensemble_classifier import SklearnStackedEnsembleClassifier from .sklearn_stacked_ensemble_regressor import SklearnStackedEnsembleRegressor diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py index 06091b82fb..f689e7a601 100644 --- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py +++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py @@ -1,3 +1,4 @@ +"""Stacked Ensemble Base Class.""" from evalml.exceptions import EnsembleMissingPipelinesError from evalml.model_family import ModelFamily from evalml.pipelines.components import Estimator diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/__init__.py b/evalml/pipelines/components/transformers/dimensionality_reduction/__init__.py index b062dd6b74..d228bb4ce3 100644 --- a/evalml/pipelines/components/transformers/dimensionality_reduction/__init__.py +++ b/evalml/pipelines/components/transformers/dimensionality_reduction/__init__.py @@ -1,2 +1,3 @@ +"""Transformers that reduce the dimensionality of the input data.""" from .lda import LinearDiscriminantAnalysis from .pca import PCA diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py index 4a7ed43662..1889423967 100644 --- a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py +++ b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py @@ -1,3 +1,4 @@ +"""Component that reduces the number of features by using Principal Component Analysis (PCA).""" import pandas as pd from sklearn.decomposition import PCA as SkPCA from skopt.space import Real diff --git a/evalml/pipelines/components/transformers/encoders/__init__.py b/evalml/pipelines/components/transformers/encoders/__init__.py index 9a7fec579c..59ffdb13ca 100644 --- a/evalml/pipelines/components/transformers/encoders/__init__.py +++ b/evalml/pipelines/components/transformers/encoders/__init__.py @@ -1,2 +1,3 @@ +"""Components used to encode the input data.""" from .onehot_encoder import OneHotEncoder from .target_encoder import TargetEncoder diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index 76d37bc586..e625e6241c 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -1,3 +1,4 @@ +"""A transformer that encodes categorical features in a one-hot numeric array.""" import numpy as np import pandas as pd import woodwork as ww diff --git a/evalml/pipelines/components/transformers/preprocessing/__init__.py b/evalml/pipelines/components/transformers/preprocessing/__init__.py index d264b3f6c6..3828a47d2c 100644 --- a/evalml/pipelines/components/transformers/preprocessing/__init__.py +++ 
b/evalml/pipelines/components/transformers/preprocessing/__init__.py @@ -1,3 +1,4 @@ +"""Preprocessing transformer components.""" from .datetime_featurizer import DateTimeFeaturizer from .drop_null_columns import DropNullColumns from .text_transformer import TextTransformer diff --git a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py index a255b58c60..50fb4cb7e7 100644 --- a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py @@ -1,3 +1,4 @@ +"""Transformer that delays input features and target variable for time series problems.""" import pandas as pd from sklearn.preprocessing import LabelEncoder, OrdinalEncoder from woodwork import logical_types diff --git a/evalml/pipelines/components/transformers/preprocessing/featuretools.py b/evalml/pipelines/components/transformers/preprocessing/featuretools.py index 6462cc99f5..1140b7cbc1 100644 --- a/evalml/pipelines/components/transformers/preprocessing/featuretools.py +++ b/evalml/pipelines/components/transformers/preprocessing/featuretools.py @@ -1,3 +1,4 @@ +"""Featuretools DFS component that generates features for the input features.""" from featuretools import EntitySet, calculate_feature_matrix, dfs from evalml.pipelines.components.transformers.transformer import Transformer diff --git a/evalml/pipelines/components/transformers/preprocessing/log_transformer.py b/evalml/pipelines/components/transformers/preprocessing/log_transformer.py index c47154402b..0cbcaf602f 100644 --- a/evalml/pipelines/components/transformers/preprocessing/log_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/log_transformer.py @@ -1,3 +1,4 @@ +"""Component that applies a log transformation to the target data.""" import numpy as np from evalml.pipelines.components.transformers.transformer import ( diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 18bd97bc52..5464dba86a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -1,3 +1,4 @@ +"""Transformer to calculate the Latent Semantic Analysis Values of text input.""" import pandas as pd from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer @@ -28,6 +29,13 @@ def __init__(self, random_seed=0, **kwargs): super().__init__(random_seed=random_seed, **kwargs) def fit(self, X, y=None): + """Fits the input data. + + Args: + X (pd.DataFrame): The data to transform. + y (pd.Series, optional): Ignored. 
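+
+        Example (illustrative sketch; assumes a toy DataFrame whose column is
+        explicitly typed as natural language so the component picks it up):
+            >>> import pandas as pd
+            >>> X = pd.DataFrame({"text": ["the quick brown fox", "jumped over the lazy dog"]})
+            >>> X.ww.init(logical_types={"text": "NaturalLanguage"})
+            >>> lsa = LSA().fit(X)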
+ + """ X = infer_feature_types(X) self._text_columns = self._get_text_columns(X) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_transformer.py b/evalml/pipelines/components/transformers/preprocessing/text_transformer.py index 903333a4ed..e8f14428be 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_transformer.py @@ -1,3 +1,4 @@ +"""Base class for all transformers working with text features.""" from evalml.pipelines.components.transformers import Transformer from evalml.utils.logger import get_logger diff --git a/evalml/pipelines/components/transformers/samplers/base_sampler.py b/evalml/pipelines/components/transformers/samplers/base_sampler.py index 65164f53f3..864ed19fab 100644 --- a/evalml/pipelines/components/transformers/samplers/base_sampler.py +++ b/evalml/pipelines/components/transformers/samplers/base_sampler.py @@ -1,3 +1,4 @@ +"""Base Sampler component. Used as the base class of all sampler components.""" import copy from abc import abstractmethod diff --git a/evalml/pipelines/components/transformers/samplers/oversampler.py b/evalml/pipelines/components/transformers/samplers/oversampler.py index fd3d4191e9..ba68ab677d 100644 --- a/evalml/pipelines/components/transformers/samplers/oversampler.py +++ b/evalml/pipelines/components/transformers/samplers/oversampler.py @@ -1,3 +1,4 @@ +"""SMOTE Oversampler component. Will automatically select whether to use SMOTE, SMOTEN, or SMOTENC based on inputs to the component.""" from evalml.pipelines.components.transformers.samplers.base_sampler import ( BaseSampler, ) @@ -7,8 +8,7 @@ class Oversampler(BaseSampler): - """ - SMOTE Oversampler component. Will automatically select whether to use SMOTE, SMOTEN, or SMOTENC based on inputs to the component. + """SMOTE Oversampler component. Will automatically select whether to use SMOTE, SMOTEN, or SMOTENC based on inputs to the component. Arguments: sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means we want a 1:4 ratio diff --git a/evalml/pipelines/components/transformers/samplers/undersampler.py b/evalml/pipelines/components/transformers/samplers/undersampler.py index 6cf4a6260d..9eb6291cc3 100644 --- a/evalml/pipelines/components/transformers/samplers/undersampler.py +++ b/evalml/pipelines/components/transformers/samplers/undersampler.py @@ -1,3 +1,4 @@ +"""An undersampling transformer to downsample the majority classes in the dataset.""" import pandas as pd from evalml.pipelines.components.transformers.samplers.base_sampler import ( @@ -69,6 +70,15 @@ def _initialize_sampler(self, X, y): self._component_obj = sampler def transform(self, X, y=None): + """Transforms the input data by sampling the data. + + Args: + X (pd.DataFrame): Training features. + y (pd.Series): Target. + + Returns: + pd.DataFrame, pd.Series: Transformed features and target. 
+ """ X_ww, y_ww = self._prepare_data(X, y) self._initialize_sampler(X, y_ww) index_df = pd.Series(y_ww.index) From aee0e45a92cf547c19cbf57cfaa4aa7170bc5371 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Mon, 30 Aug 2021 18:19:12 -0400 Subject: [PATCH 18/62] revert tests and clean more --- evalml/objectives/__init__.py | 1 + evalml/objectives/cost_benefit_matrix.py | 13 +++++---- evalml/objectives/objective_base.py | 1 + evalml/objectives/sensitivity_low_alert.py | 13 +++++---- evalml/objectives/utils.py | 1 + evalml/pipelines/component_graph.py | 5 ++++ .../sklearn_stacked_ensemble_classifier.py | 1 + .../sklearn_stacked_ensemble_regressor.py | 1 + .../components/transformers/__init__.py | 1 + .../transformers/column_selectors.py | 27 +++++++++++++++--- .../dimensionality_reduction/lda.py | 28 +++++++++++++++++++ .../dimensionality_reduction/pca.py | 27 ++++++++++++++++++ .../transformers/encoders/onehot_encoder.py | 23 ++++++++++----- .../transformers/encoders/target_encoder.py | 28 +++++++++++++++++++ .../feature_selection/__init__.py | 1 + .../feature_selection/feature_selector.py | 12 +++++++- .../rf_classifier_feature_selector.py | 1 + .../rf_regressor_feature_selector.py | 1 + .../preprocessing/datetime_featurizer.py | 12 +++++++- .../delayed_feature_transformer.py | 9 ++++++ .../transformers/samplers/__init__.py | 1 + .../transformers/samplers/base_sampler.py | 9 ++++++ .../transformers/samplers/oversampler.py | 18 +++++++++--- .../transformers/scalers/__init__.py | 1 + .../transformers/scalers/standard_scaler.py | 19 +++++++++++++ .../components/transformers/transformer.py | 1 + .../multiclass_classification_pipeline.py | 1 + evalml/pipelines/regression_pipeline.py | 14 ++++++++-- evalml/tests/automl_tests/dask_test_utils.py | 18 +++++------- .../parallel_tests/test_automl_dask.py | 10 +++---- .../parallel_tests/test_cf_engine.py | 21 +++++++++----- .../parallel_tests/test_dask_engine.py | 21 +++++++++----- evalml/tests/automl_tests/test_automl.py | 5 ++-- .../test_catboost_classifier.py | 2 +- .../test_catboost_regressor.py | 2 +- .../component_tests/test_lgbm_classifier.py | 2 +- .../component_tests/test_lgbm_regressor.py | 2 +- .../component_tests/test_simple_imputer.py | 5 +++- .../test_xgboost_classifier.py | 2 +- .../component_tests/test_xgboost_regressor.py | 2 +- evalml/tests/conftest.py | 14 ++++++---- .../data_checks_tests/test_data_check.py | 2 +- .../data_checks_tests/test_data_checks.py | 6 ++-- .../test_algorithms.py | 6 ++-- .../test_explainers.py | 18 ++++++------ .../test_partial_dependence.py | 7 ++--- .../test_permutation_importance.py | 7 ++--- .../test_binary_classification_objective.py | 6 ++-- .../test_cost_benefit_matrix.py | 8 +++--- .../tests/objective_tests/test_objectives.py | 2 +- .../pipeline_tests/test_component_graph.py | 12 ++++---- .../tests/utils_tests/test_woodwork_utils.py | 8 +++--- 52 files changed, 338 insertions(+), 120 deletions(-) diff --git a/evalml/objectives/__init__.py b/evalml/objectives/__init__.py index eecbaf9c3c..07d5d795c7 100644 --- a/evalml/objectives/__init__.py +++ b/evalml/objectives/__init__.py @@ -1,3 +1,4 @@ +"""EvalML standard and custom objectives.""" from .binary_classification_objective import BinaryClassificationObjective from .cost_benefit_matrix import CostBenefitMatrix from .fraud_cost import FraudCost diff --git a/evalml/objectives/cost_benefit_matrix.py b/evalml/objectives/cost_benefit_matrix.py index 9c4d790335..00c44238f5 100644 --- a/evalml/objectives/cost_benefit_matrix.py +++ 
b/evalml/objectives/cost_benefit_matrix.py @@ -1,3 +1,4 @@ +"""Cost-benefit matrix objective.""" import numpy as np from .binary_classification_objective import BinaryClassificationObjective @@ -9,10 +10,10 @@ class CostBenefitMatrix(BinaryClassificationObjective): """Score using a cost-benefit matrix. Scores quantify the benefits of a given value, so greater numeric scores represents a better score. Costs and scores can be negative, indicating that a value is not beneficial. For example, in the case of monetary profit, a negative cost and/or score represents loss of cash flow. Args: - true_positive (float): Cost associated with true positive predictions - true_negative (float): Cost associated with true negative predictions - false_positive (float): Cost associated with false positive predictions - false_negative (float): Cost associated with false negative predictions + true_positive (float): Cost associated with true positive predictions. + true_negative (float): Cost associated with true negative predictions. + false_positive (float): Cost associated with false positive predictions. + false_negative (float): Cost associated with false negative predictions. """ name = "Cost Benefit Matrix" @@ -35,8 +36,8 @@ def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): """Calculates cost-benefit of the using the predicted and true values. Args: - y_predicted (pd.Series): Predicted labels - y_true (pd.Series): True labels + y_predicted (pd.Series): Predicted labels. + y_true (pd.Series): True labels. X (pd.DataFrame): Ignored. sample_weight (pd.DataFrame): Ignored. diff --git a/evalml/objectives/objective_base.py b/evalml/objectives/objective_base.py index e1f0347e36..8a0e0ac16e 100644 --- a/evalml/objectives/objective_base.py +++ b/evalml/objectives/objective_base.py @@ -1,3 +1,4 @@ +"""Base class for all objectives.""" from abc import ABC, abstractmethod import numpy as np diff --git a/evalml/objectives/sensitivity_low_alert.py b/evalml/objectives/sensitivity_low_alert.py index bf3839f19c..5746eff9e9 100644 --- a/evalml/objectives/sensitivity_low_alert.py +++ b/evalml/objectives/sensitivity_low_alert.py @@ -1,3 +1,4 @@ +"""Sensitivity at Low Alert Rates objective.""" import numpy as np from .binary_classification_objective import BinaryClassificationObjective @@ -8,6 +9,12 @@ class SensitivityLowAlert(BinaryClassificationObjective): + """Create instance of SensitivityLowAlert. + + Args: + alert_rate (float): percentage of top scores to classify as high risk. + """ + name = "Sensitivity at Low Alert Rates" greater_is_better = True score_needs_proba = False @@ -16,11 +23,6 @@ class SensitivityLowAlert(BinaryClassificationObjective): expected_range = [0, 1] def __init__(self, alert_rate=0.01): - """Create instance of SensitivityLowAlert. 
- - Args: - alert_rate (float): percentage of top scores to classify as high risk - """ if (alert_rate > 1) or (alert_rate < 0): raise ValueError("Alert rate is outside of valid range [0,1]") @@ -32,7 +34,6 @@ def decision_function(self, ypred_proba, **kwargs): Args: ypred_proba (pd.Series): Predicted probabilities """ - ypred_proba = self._standardize_input_type(ypred_proba) if len(ypred_proba.unique()) == 1: logger.debug( diff --git a/evalml/objectives/utils.py b/evalml/objectives/utils.py index 8a78684f17..cc49f5c78f 100644 --- a/evalml/objectives/utils.py +++ b/evalml/objectives/utils.py @@ -1,3 +1,4 @@ +"""Utility methods for EvalML objectives.""" from .objective_base import ObjectiveBase from evalml import objectives diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index e693533c32..d623492cb4 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -1,3 +1,4 @@ +"""Component graph for a pipeline as a directed acyclic graph (DAG).""" import inspect import warnings @@ -609,12 +610,14 @@ def generate_order(cls, component_dict): return compute_order def __getitem__(self, index): + """Get an element in the component graph.""" if isinstance(index, int): return self.get_component(self.compute_order[index]) else: return self.get_component(index) def __iter__(self): + """Iterator for the component graph.""" self._i = 0 return self @@ -632,6 +635,7 @@ def __next__(self): raise StopIteration def __eq__(self, other): + """Test for equality.""" if not isinstance(other, self.__class__): return False random_seed_eq = self.random_seed == other.random_seed @@ -644,6 +648,7 @@ def __eq__(self, other): return True def __repr__(self): + """String representation of a component graph.""" component_strs = [] for ( component_name, diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py index 9ce2d90647..7c982f6278 100644 --- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py +++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_classifier.py @@ -1,3 +1,4 @@ +"""Scikit-learn Stacked Ensemble Classifier.""" from sklearn.ensemble import StackingClassifier from sklearn.model_selection import StratifiedKFold diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py index 37f8078b9e..8178b92fee 100644 --- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py +++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_regressor.py @@ -1,3 +1,4 @@ +"""Scikit-learn Stacked Ensemble Regressor.""" from sklearn.ensemble import StackingRegressor from sklearn.model_selection import KFold diff --git a/evalml/pipelines/components/transformers/__init__.py b/evalml/pipelines/components/transformers/__init__.py index fae3c8de87..6f941b2343 100644 --- a/evalml/pipelines/components/transformers/__init__.py +++ b/evalml/pipelines/components/transformers/__init__.py @@ -1,3 +1,4 @@ +"""Components that transform data.""" from .transformer import Transformer from .encoders import OneHotEncoder, TargetEncoder from .feature_selection import ( diff --git a/evalml/pipelines/components/transformers/column_selectors.py b/evalml/pipelines/components/transformers/column_selectors.py index 9f85dc1b76..c6fb032dd0 100644 --- 
a/evalml/pipelines/components/transformers/column_selectors.py
+++ b/evalml/pipelines/components/transformers/column_selectors.py
@@ -1,3 +1,4 @@
+"""Initializes a transformer that selects specified columns in input data."""
 from abc import abstractmethod
 
 from evalml.pipelines.components.transformers import Transformer
@@ -5,7 +6,7 @@
 
 
 class ColumnSelector(Transformer):
-    """Initalizes an transformer that drops specified columns in input data.
+    """Initializes a transformer that selects specified columns in input data.
 
     Args:
         columns (list(string)): List of column names, used to determine which columns to select.
@@ -43,7 +44,7 @@ def fit(self, X, y=None):
             X (pd.DataFrame): Data to check.
             y (pd.Series, optional): Targets.
 
-        Returns
+        Returns:
             self
         """
         X = infer_feature_types(X)
@@ -51,6 +52,15 @@
         return self
 
     def transform(self, X, y=None):
+        """Transform data using fitted column selector component.
+
+        Args:
+            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
+            y (pd.Series, optional): The target training data of length [n_samples].
+
+        Returns:
+            pd.DataFrame: Transformed data.
+        """
         X = infer_feature_types(X)
         self._check_input_for_columns(X)
         cols = self.parameters.get("columns") or []
@@ -81,7 +91,7 @@ def transform(self, X, y=None):
             X (pd.DataFrame): Data to transform.
             y (pd.Series, optional): Targets.
 
-        Returns
+        Returns:
             pd.DataFrame: Transformed X.
         """
         return super().transform(X, y)
@@ -110,7 +120,7 @@ def transform(self, X, y=None):
             X (pd.DataFrame): Data to transform.
             y (pd.Series, optional): Targets.
 
-        Returns
+        Returns:
             pd.DataFrame: Transformed X.
         """
         return super().transform(X, y)
@@ -148,6 +158,15 @@ def _modify_columns(self, cols, X, y=None):
         return X.ww.select(cols)
 
     def transform(self, X, y=None):
+        """Transforms data X by selecting columns.
+
+        Args:
+            X (pd.DataFrame): Data to transform.
+            y (pd.Series, optional): Targets.
+
+        Returns:
+            pd.DataFrame: Transformed X.
+        """
         X = infer_feature_types(X)
         self._check_input_for_columns(X)
         cols = self.parameters.get("column_types") or []
diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
index 40026f48bb..97d00bbdb3 100644
--- a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
+++ b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
@@ -1,3 +1,4 @@
+"""Component that reduces the number of features by using Linear Discriminant Analysis."""
 import pandas as pd
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as SkLDA
 
@@ -34,6 +35,15 @@ def __init__(self, n_components=None, random_seed=0, **kwargs):
         )
 
     def fit(self, X, y):
+        """Fits the LDA component.
+
+        Args:
+            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
+            y (pd.Series, optional): The target training data of length [n_samples].
+
+        Returns:
+            self
+        """
         X = infer_feature_types(X)
         if not is_all_numeric(X):
             raise ValueError("LDA input must be all numeric")
@@ -48,6 +58,15 @@
         return self
 
     def transform(self, X, y=None):
+        """Transform data using the fitted LDA component.
+
+        Args:
+            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
+            y (pd.Series, optional): The target training data of length [n_samples].
+
+        Returns:
+            pd.DataFrame: Transformed data.
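+
+        Example (illustrative sketch; assumes all-numeric features and a
+        binary target, for which at most one discriminant component exists):
+            >>> import pandas as pd
+            >>> X = pd.DataFrame({"a": [1, 2, 3, 4], "b": [0, 1, 0, 1]})
+            >>> y = pd.Series([0, 0, 1, 1])
+            >>> lda = LinearDiscriminantAnalysis(n_components=1)
+            >>> X_t = lda.fit(X, y).transform(X)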
+ """ X_ww = infer_feature_types(X) if not is_all_numeric(X_ww): raise ValueError("LDA input must be all numeric") @@ -60,6 +79,15 @@ def transform(self, X, y=None): return _retain_custom_types_and_initalize_woodwork(X_ww, X_t) def fit_transform(self, X, y=None): + """Fit and transform data using the LDA component. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + pd.DataFrame: Transformed data. + """ X_ww = infer_feature_types(X) if not is_all_numeric(X_ww): raise ValueError("LDA input must be all numeric") diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py index 1889423967..049e14bd9e 100644 --- a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py +++ b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py @@ -38,6 +38,15 @@ def __init__(self, variance=0.95, n_components=None, random_seed=0, **kwargs): ) def fit(self, X, y=None): + """Fits the PCA component. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + """ X = infer_feature_types(X) if not is_all_numeric(X): raise ValueError("PCA input must be all numeric") @@ -45,6 +54,15 @@ def fit(self, X, y=None): return self def transform(self, X, y=None): + """Transform data using fitted PCA component. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + pd.DataFrame: Transformed data. + """ X_ww = infer_feature_types(X) if not is_all_numeric(X_ww): raise ValueError("PCA input must be all numeric") @@ -57,6 +75,15 @@ def transform(self, X, y=None): return _retain_custom_types_and_initalize_woodwork(X_ww, X_t) def fit_transform(self, X, y=None): + """Fit and transform data using the PCA component. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + pd.DataFrame: Transformed data. + """ X_ww = infer_feature_types(X) if not is_all_numeric(X_ww): raise ValueError("PCA input must be all numeric") diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index e625e6241c..fe59199bae 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -93,6 +93,15 @@ def _get_cat_cols(X): return list(X.ww.select(include=["category"], return_schema=True).columns) def fit(self, X, y=None): + """Fits the one-hot encoder component. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + """ top_n = self.parameters["top_n"] X = infer_feature_types(X) if self.features_to_encode is None: @@ -163,7 +172,7 @@ def transform(self, X, y=None): X (pd.DataFrame): Features to one-hot encode. y (pd.Series): Ignored. - Returns + Returns: pd.DataFrame: Transformed data, where each categorical feature has been encoded into numerical columns using one-hot encoding. 
""" X = infer_feature_types(X) @@ -203,9 +212,10 @@ def categories(self, feature_name): """Returns a list of the unique categories to be encoded for the particular feature, in order. Args: - feature_name (str): the name of any feature provided to one-hot encoder during fit - Returns - np.ndarray: the unique categories, in the same dtype as they were provided during fit + feature_name (str): The name of any feature provided to one-hot encoder during fit. + + Returns: + np.ndarray: The unique categories, in the same dtype as they were provided during fit. """ try: index = self.features_to_encode.index(feature_name) @@ -218,7 +228,6 @@ def categories(self, feature_name): @staticmethod def _make_name_unique(name, seen_before): """Helper to make the name unique.""" - if name not in seen_before: return name @@ -239,7 +248,7 @@ def _get_feature_names(self): For example, consider a dataframe with a column called "A" and category "x_y" and another column called "A_x" with "y". In this example, the feature names would be "A_x_y" and "A_x_y_1". - Returns + Returns: np.ndarray: The feature names after encoding, provided in the same order as input_features. """ self._features_to_drop = [] @@ -286,7 +295,7 @@ def get_feature_names(self): For example, consider a dataframe with a column called "A" and category "x_y" and another column called "A_x" with "y". In this example, the feature names would be "A_x_y" and "A_x_y_1". - Returns + Returns: np.ndarray: The feature names after encoding, provided in the same order as input_features. """ feature_names = self._get_feature_names() diff --git a/evalml/pipelines/components/transformers/encoders/target_encoder.py b/evalml/pipelines/components/transformers/encoders/target_encoder.py index d16e4d437c..6abfcdf94e 100644 --- a/evalml/pipelines/components/transformers/encoders/target_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/target_encoder.py @@ -1,3 +1,4 @@ +"""A transformer that encodes categorical features into target encodings.""" import pandas as pd from woodwork.logical_types import Categorical @@ -76,9 +77,27 @@ def __init__( ) def fit(self, X, y): + """Fits the target encoder. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + """ return super().fit(X, y) def transform(self, X, y=None): + """Transform data using the fitted target encoder. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + pd.DataFrame: Transformed data. + """ X_ww = infer_feature_types(X) if y is not None: y = infer_feature_types(y) @@ -89,6 +108,15 @@ def transform(self, X, y=None): ) def fit_transform(self, X, y): + """Fit and transform data using the target encoder. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + pd.DataFrame: Transformed data. 
+ """ return self.fit(X, y).transform(X, y) def get_feature_names(self): diff --git a/evalml/pipelines/components/transformers/feature_selection/__init__.py b/evalml/pipelines/components/transformers/feature_selection/__init__.py index 6b1d4bb736..bf9dbfcd86 100644 --- a/evalml/pipelines/components/transformers/feature_selection/__init__.py +++ b/evalml/pipelines/components/transformers/feature_selection/__init__.py @@ -1,3 +1,4 @@ +"""Components that select features.""" from .feature_selector import FeatureSelector from .rf_classifier_feature_selector import RFClassifierSelectFromModel from .rf_regressor_feature_selector import RFRegressorSelectFromModel diff --git a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py index 49557214f9..f13ab921a2 100644 --- a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py @@ -1,3 +1,4 @@ +"""Component that selects top features based on importance weights.""" import pandas as pd from evalml.exceptions import MethodPropertyNotFoundError @@ -39,7 +40,7 @@ def transform(self, X, y=None): X (pd.DataFrame): Data to transform. y (pd.Series, optional): Target data. Ignored. - Returns + Returns: pd.DataFrame: Transformed X """ X_ww = infer_feature_types(X) @@ -63,4 +64,13 @@ def transform(self, X, y=None): ) def fit_transform(self, X, y=None): + """Fit and transform data using the feature selector. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + pd.DataFrame: Transformed data. 
+ """ return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py index b8917c470e..3672cd1f72 100644 --- a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py @@ -1,3 +1,4 @@ +"""Component that selects top features based on importance weights using a Random Forest classifier.""" import numpy as np from sklearn.ensemble import RandomForestClassifier as SKRandomForestClassifier from sklearn.feature_selection import SelectFromModel as SkSelect diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py index 9462447c91..e9b8a1f0d3 100644 --- a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py @@ -1,3 +1,4 @@ +"""Component that selects top features based on importance weights using a Random Forest regresor.""" import numpy as np from sklearn.ensemble import RandomForestRegressor as SKRandomForestRegressor from sklearn.feature_selection import SelectFromModel as SkSelect diff --git a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py index 82aaffc062..a64fa8ee63 100644 --- a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py @@ -1,3 +1,4 @@ +"""Transformer that can automatically extract features from datetime columns.""" from evalml.pipelines.components.transformers import Transformer from evalml.utils import infer_feature_types @@ -110,6 +111,15 @@ def __init__( ) def fit(self, X, y=None): + """Fit the datetime featurizer component. + + Args: + X (pd.DataFrame): Input features. + y (pd.Series, optional): Target data. Ignored. + + Returns: + self + """ X = infer_feature_types(X) self._date_time_col_names = list( X.ww.select("datetime", return_schema=True).columns @@ -120,7 +130,7 @@ def transform(self, X, y=None): """Transforms data X by creating new features using existing DateTime columns, and then dropping those DateTime columns. Args: - X (pd.DataFrame): Data to transform + X (pd.DataFrame): Input features. y (pd.Series, optional): Ignored. Returns diff --git a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py index 50fb4cb7e7..1b24cd81d8 100644 --- a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py @@ -132,4 +132,13 @@ def transform(self, X, y=None): return X_ww def fit_transform(self, X, y): + """Fit the component and transform the input data. + + Args: + X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used. + y (pd.Series, or None): Target. + + Returns + pd.DataFrame: Transformed X. 
+ """ return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/transformers/samplers/__init__.py b/evalml/pipelines/components/transformers/samplers/__init__.py index 949b5b2041..056ead0016 100644 --- a/evalml/pipelines/components/transformers/samplers/__init__.py +++ b/evalml/pipelines/components/transformers/samplers/__init__.py @@ -1,2 +1,3 @@ +"""Sampler components.""" from .undersampler import Undersampler from .oversampler import Oversampler diff --git a/evalml/pipelines/components/transformers/samplers/base_sampler.py b/evalml/pipelines/components/transformers/samplers/base_sampler.py index 864ed19fab..7dced470f7 100644 --- a/evalml/pipelines/components/transformers/samplers/base_sampler.py +++ b/evalml/pipelines/components/transformers/samplers/base_sampler.py @@ -126,4 +126,13 @@ def _dictionary_to_params(self, sampling_dict, y): return param_copy def fit_transform(self, X, y): + """Fit and transform data using the sampler component. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + (pd.DataFrame, pd.Series): Transformed data. + """ return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/transformers/samplers/oversampler.py b/evalml/pipelines/components/transformers/samplers/oversampler.py index ba68ab677d..328f021e52 100644 --- a/evalml/pipelines/components/transformers/samplers/oversampler.py +++ b/evalml/pipelines/components/transformers/samplers/oversampler.py @@ -10,7 +10,7 @@ class Oversampler(BaseSampler): """SMOTE Oversampler component. Will automatically select whether to use SMOTE, SMOTEN, or SMOTENC based on inputs to the component. - Arguments: + Args: sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means we want a 1:4 ratio of the minority to majority class after oversampling. We will create the a sampling dictionary using this ratio, with the keys corresponding to the class and the values responding to the number of samples. Defaults to 0.25. @@ -51,6 +51,15 @@ def __init__( ) def fit(self, X, y): + """Fits oversampler to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + """ X_ww, y_ww = self._prepare_data(X, y) self.sampler = self._get_best_oversampler(X_ww) @@ -79,10 +88,11 @@ def _get_categorical(self, X): self._parameters["categorical_features"] = self.categorical_features def _initialize_sampler(self, X, y): - """Initializes the oversampler with the given sampler_ratio or sampler_ratio_dict. If a sampler_ratio_dict is provided, we will opt to use that. - Otherwise, we use will create the sampler_ratio_dict dictionary. + """Initializes the oversampler with the given sampler_ratio or sampler_ratio_dict. + + If a sampler_ratio_dict is provided, we will opt to use that. Otherwise, we use will create the sampler_ratio_dict dictionary. - Arguments: + Args: X (pd.DataFrame): Input features. y (pd.Series): Target. 
""" diff --git a/evalml/pipelines/components/transformers/scalers/__init__.py b/evalml/pipelines/components/transformers/scalers/__init__.py index ac94082ea8..aa9efad855 100644 --- a/evalml/pipelines/components/transformers/scalers/__init__.py +++ b/evalml/pipelines/components/transformers/scalers/__init__.py @@ -1 +1,2 @@ +"""Components that scale input data.""" from .standard_scaler import StandardScaler diff --git a/evalml/pipelines/components/transformers/scalers/standard_scaler.py b/evalml/pipelines/components/transformers/scalers/standard_scaler.py index 02441cc2be..68dda3c206 100644 --- a/evalml/pipelines/components/transformers/scalers/standard_scaler.py +++ b/evalml/pipelines/components/transformers/scalers/standard_scaler.py @@ -1,3 +1,4 @@ +"""A transformer that standardizes input features by removing the mean and scaling to unit variance.""" import pandas as pd from sklearn.preprocessing import StandardScaler as SkScaler from woodwork.logical_types import Boolean, Categorical, Integer @@ -30,6 +31,15 @@ def __init__(self, random_seed=0, **kwargs): ) def transform(self, X, y=None): + """Transform data using the fitted standard scaler. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + pd.DataFrame: Transformed data. + """ X = infer_feature_types(X) original_ltypes = X.ww.schema.logical_types X = X.ww.select_dtypes(exclude=["datetime"]) @@ -40,6 +50,15 @@ def transform(self, X, y=None): ) def fit_transform(self, X, y=None): + """Fit and transform data using the standard scaler component. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + pd.DataFrame: Transformed data. + """ X = infer_feature_types(X) X = X.select_dtypes(exclude=["datetime"]) return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/transformers/transformer.py b/evalml/pipelines/components/transformers/transformer.py index 53c7179ba6..c66595d480 100644 --- a/evalml/pipelines/components/transformers/transformer.py +++ b/evalml/pipelines/components/transformers/transformer.py @@ -1,3 +1,4 @@ +"""A component that may or may not need fitting that transforms data. 
These components are used before an estimator.""" from abc import abstractmethod import pandas as pd diff --git a/evalml/pipelines/multiclass_classification_pipeline.py b/evalml/pipelines/multiclass_classification_pipeline.py index 811e9b05ec..72fbab8516 100644 --- a/evalml/pipelines/multiclass_classification_pipeline.py +++ b/evalml/pipelines/multiclass_classification_pipeline.py @@ -1,3 +1,4 @@ +"""Pipeline subclass for all multiclass classification pipelines.""" from evalml.pipelines.classification_pipeline import ClassificationPipeline from evalml.problem_types import ProblemTypes diff --git a/evalml/pipelines/regression_pipeline.py b/evalml/pipelines/regression_pipeline.py index 01fc3edf60..eafd8d149d 100644 --- a/evalml/pipelines/regression_pipeline.py +++ b/evalml/pipelines/regression_pipeline.py @@ -1,3 +1,4 @@ +"""Pipeline subclass for all regression pipelines.""" from evalml.pipelines import PipelineBase from evalml.problem_types import ProblemTypes from evalml.utils import infer_feature_types @@ -28,7 +29,7 @@ def fit(self, X, y): X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, np.ndarray): The target training data of length [n_samples] - Returns + Returns: self """ X = infer_feature_types(X) @@ -47,7 +48,7 @@ def score(self, X, y, objectives): y (pd.Series, or np.ndarray): True values of length [n_samples] objectives (list): Non-empty list of objectives to score on - Returns + Returns: dict: Ordered dictionary of objective scores """ objectives = self.create_objectives(objectives) @@ -57,6 +58,15 @@ def score(self, X, y, objectives): ) def predict(self, X, objective=None): + """Make predictions using selected features. + + Args: + X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. + objective (Object or string): The objective to use to make predictions. + + Returns: + pd.Series: Predicted values. + """ X = infer_feature_types(X) predictions = self.component_graph.predict(X) predictions = self.inverse_transform(predictions) diff --git a/evalml/tests/automl_tests/dask_test_utils.py b/evalml/tests/automl_tests/dask_test_utils.py index cd393d32ab..3cb16111f9 100644 --- a/evalml/tests/automl_tests/dask_test_utils.py +++ b/evalml/tests/automl_tests/dask_test_utils.py @@ -9,7 +9,7 @@ # Top-level replacement for AutoML object to supply data for testing purposes. def err_call(*args, **kwargs): - """No-op.""" + """No-op""" data_splitter = TrainingValidationSplit() @@ -96,11 +96,9 @@ def score(self, X, y, objectives): class DaskPipelineSlow(BinaryClassificationPipeline): - """Pipeline for testing whose fit() should take longer than the fast pipeline. - - This exists solely to test AutoMLSearch termination and not complete - fitting. - """ + """Pipeline for testing whose fit() should take longer than the + fast pipeline. This exists solely to test AutoMLSearch termination + and not complete fitting.""" component_graph = ["Baseline Classifier"] custom_name = "SlowPipeline" @@ -125,11 +123,9 @@ def fit(self, X, y): class DaskPipelineFast(BinaryClassificationPipeline): - """Pipeline for testing whose fit() should complete before the slow pipeline. - - This exists solely to test AutoMLSearch termination and complete - fitting. - """ + """Pipeline for testing whose fit() should complete before the + slow pipeline. 
This exists solely to test AutoMLSearch termination + and complete fitting.""" component_graph = ["Baseline Classifier"] custom_name = "FastPipeline" diff --git a/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py b/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py index 9b4f4fd1fc..fa5102ca9e 100644 --- a/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py +++ b/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py @@ -51,12 +51,12 @@ def process_pool(): def _get_engine_support(parallel_engine_type, thread_pool, cluster): - """Helper function to return the proper combination of resource pool, client class and engine class for testing purposes. + """Helper function to return the proper combination of resource pool, client class and + engine class for testing purposes. - e.g. The CFEngine can be run either with a ThreadPoolExecutor or a - ProcessPoolExecutor, so _get_engine_support("CFEngine", - thread_pool, cluster) returns a tuple of (ThreadPoolExecutor, - cf.Client, cf.CFEngine) + e.g. The CFEngine can be run either with a ThreadPoolExecutor or a ProcessPoolExecutor, + so _get_engine_support("CFEngine", thread_pool, cluster) returns a + tuple of (ThreadPoolExecutor, cf.Client, cf.CFEngine) """ if parallel_engine_type == "CFEngine": resources = thread_pool diff --git a/evalml/tests/automl_tests/parallel_tests/test_cf_engine.py b/evalml/tests/automl_tests/parallel_tests/test_cf_engine.py index 2e570cef7d..178822f32f 100644 --- a/evalml/tests/automl_tests/parallel_tests/test_cf_engine.py +++ b/evalml/tests/automl_tests/parallel_tests/test_cf_engine.py @@ -59,7 +59,8 @@ def test_init(process_pool): def test_submit_training_job_single( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that training a single pipeline using the parallel engine produces the same results as simply running the train_pipeline function.""" + """Test that training a single pipeline using the parallel engine produces the + same results as simply running the train_pipeline function.""" X, y = X_y_binary_cls pool = get_pool(pool_type, thread_pool, process_pool) with CFClient(pool) as client: @@ -90,7 +91,8 @@ def test_submit_training_job_single( def test_submit_training_jobs_multiple( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that training multiple pipelines using the parallel engine produces the same results as the sequential engine.""" + """Test that training multiple pipelines using the parallel engine produces the + same results as the sequential engine.""" X, y = X_y_binary_cls pool = get_pool(pool_type, thread_pool, process_pool) with CFClient(pool) as client: @@ -134,7 +136,8 @@ def fit_pipelines(pipelines, engine): def test_submit_evaluate_job_single( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that evaluating a single pipeline using the parallel engine produces the same results as simply running the evaluate_pipeline function.""" + """Test that evaluating a single pipeline using the parallel engine produces the + same results as simply running the evaluate_pipeline function.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -188,7 +191,8 @@ def test_submit_evaluate_job_single( def test_submit_evaluate_jobs_multiple( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that evaluating multiple pipelines using the parallel engine produces the same results as the sequential engine.""" + """Test that evaluating multiple pipelines using the parallel engine produces the + same results as the 
sequential engine.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -244,7 +248,8 @@ def eval_pipelines(pipelines, engine): def test_submit_scoring_job_single( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that scoring a single pipeline using the parallel engine produces the same results as simply running the score_pipeline function.""" + """Test that scoring a single pipeline using the parallel engine produces the + same results as simply running the score_pipeline function.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -283,7 +288,8 @@ def test_submit_scoring_job_single( def test_submit_scoring_jobs_multiple( X_y_binary_cls, pool_type, thread_pool, process_pool ): - """Test that scoring multiple pipelines using the parallel engine produces the same results as the sequential engine.""" + """Test that scoring multiple pipelines using the parallel engine produces the + same results as the sequential engine.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -338,7 +344,8 @@ def score_pipelines(pipelines, engine): @pytest.mark.parametrize("pool_type", ["threads"]) def test_cancel_job(X_y_binary_cls, pool_type, thread_pool, process_pool): - """Test that training a single pipeline using the parallel engine produces the same results as simply running the train_pipeline function.""" + """Test that training a single pipeline using the parallel engine produces the + same results as simply running the train_pipeline function.""" X, y = X_y_binary_cls pool = get_pool(pool_type, thread_pool, process_pool) diff --git a/evalml/tests/automl_tests/parallel_tests/test_dask_engine.py b/evalml/tests/automl_tests/parallel_tests/test_dask_engine.py index ec424765f9..33c18ccd92 100644 --- a/evalml/tests/automl_tests/parallel_tests/test_dask_engine.py +++ b/evalml/tests/automl_tests/parallel_tests/test_dask_engine.py @@ -42,7 +42,8 @@ def test_init(cluster): def test_submit_training_job_single(X_y_binary_cls, cluster): - """Test that training a single pipeline using the parallel engine produces the same results as simply running the train_pipeline function.""" + """Test that training a single pipeline using the parallel engine produces the + same results as simply running the train_pipeline function.""" X, y = X_y_binary_cls with Client(cluster) as client: engine = DaskEngine(client=client) @@ -69,7 +70,8 @@ def test_submit_training_job_single(X_y_binary_cls, cluster): def test_submit_training_jobs_multiple(X_y_binary_cls, cluster): - """Test that training multiple pipelines using the parallel engine produces the same results as the sequential engine.""" + """Test that training multiple pipelines using the parallel engine produces the + same results as the sequential engine.""" X, y = X_y_binary_cls with Client(cluster) as client: pipelines = [ @@ -109,7 +111,8 @@ def fit_pipelines(pipelines, engine): def test_submit_evaluate_job_single(X_y_binary_cls, cluster): - """Test that evaluating a single pipeline using the parallel engine produces the same results as simply running the evaluate_pipeline function.""" + """Test that evaluating a single pipeline using the parallel engine produces the + same results as simply running the evaluate_pipeline function.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -159,7 +162,8 @@ def test_submit_evaluate_job_single(X_y_binary_cls, cluster): def test_submit_evaluate_jobs_multiple(X_y_binary_cls, cluster): - """Test that evaluating multiple pipelines using the parallel engine produces the same results 
as the sequential engine.""" + """Test that evaluating multiple pipelines using the parallel engine produces the + same results as the sequential engine.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -211,7 +215,8 @@ def eval_pipelines(pipelines, engine): def test_submit_scoring_job_single(X_y_binary_cls, cluster): - """Test that scoring a single pipeline using the parallel engine produces the same results as simply running the score_pipeline function.""" + """Test that scoring a single pipeline using the parallel engine produces the + same results as simply running the score_pipeline function.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -246,7 +251,8 @@ def test_submit_scoring_job_single(X_y_binary_cls, cluster): def test_submit_scoring_jobs_multiple(X_y_binary_cls, cluster): - """Test that scoring multiple pipelines using the parallel engine produces the same results as the sequential engine.""" + """Test that scoring multiple pipelines using the parallel engine produces the + same results as the sequential engine.""" X, y = X_y_binary_cls X.ww.init() y = ww.init_series(y) @@ -299,7 +305,8 @@ def score_pipelines(pipelines, engine): def test_cancel_job(X_y_binary_cls, cluster): - """Test that training a single pipeline using the parallel engine produces the same results as simply running the train_pipeline function.""" + """Test that training a single pipeline using the parallel engine produces the + same results as simply running the train_pipeline function.""" X, y = X_y_binary_cls with Client(cluster) as client: diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 674888cae0..4860fa11a1 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1640,7 +1640,6 @@ def __init__(self, k, starting_index): def __call__(self): """Raises KeyboardInterrupt on the kth call. - Arguments are ignored but included to meet the call back API. 
""" if self.n_calls == self.k: @@ -2001,7 +2000,7 @@ def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) def fit(self, *args, **kwargs): - """Mocking fit.""" + """Mocking fit""" class Pipeline1(DummyPipeline): custom_name = "Pipeline1" @@ -2168,7 +2167,7 @@ def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) def fit(self, *args, **kwargs): - """Mocking fit.""" + """Mocking fit""" additional_objectives = None if custom_additional_objective: diff --git a/evalml/tests/component_tests/test_catboost_classifier.py b/evalml/tests/component_tests/test_catboost_classifier.py index e8f33d2e9f..d19eb387a4 100644 --- a/evalml/tests/component_tests/test_catboost_classifier.py +++ b/evalml/tests/component_tests/test_catboost_classifier.py @@ -10,7 +10,7 @@ def test_catboost_classifier_random_seed_bounds_seed(X_y_binary): - """ensure catboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" + """ensure catboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" X, y = X_y_binary col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/component_tests/test_catboost_regressor.py b/evalml/tests/component_tests/test_catboost_regressor.py index 5acbf7fdda..84244c56ad 100644 --- a/evalml/tests/component_tests/test_catboost_regressor.py +++ b/evalml/tests/component_tests/test_catboost_regressor.py @@ -10,7 +10,7 @@ def test_catboost_regressor_random_seed_bounds_seed(X_y_regression): - """ensure catboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" + """ensure catboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" X, y = X_y_regression col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py index f121ed8061..1766024f0a 100644 --- a/evalml/tests/component_tests/test_lgbm_classifier.py +++ b/evalml/tests/component_tests/test_lgbm_classifier.py @@ -28,7 +28,7 @@ def test_problem_types(): def test_lightgbm_classifier_random_seed_bounds_seed(X_y_binary): - """ensure lightgbm's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" + """ensure lightgbm's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" X, y = X_y_binary col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/component_tests/test_lgbm_regressor.py b/evalml/tests/component_tests/test_lgbm_regressor.py index 983b7514b4..1b20b23bd4 100644 --- a/evalml/tests/component_tests/test_lgbm_regressor.py +++ b/evalml/tests/component_tests/test_lgbm_regressor.py @@ -23,7 +23,7 @@ def test_problem_types(): def test_lightgbm_regressor_random_seed_bounds_seed(X_y_regression): - """ensure lightgbm's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" + """ensure lightgbm's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" X, y = X_y_regression col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py index f6d4650070..0298d97357 100644 --- 
a/evalml/tests/component_tests/test_simple_imputer.py +++ b/evalml/tests/component_tests/test_simple_imputer.py @@ -413,7 +413,10 @@ def test_simple_imputer_woodwork_custom_overrides_returned_by_components( def test_component_handles_pre_init_ww(): - """Test to determine whether SimpleImputer can handle a Woodwork-inited DataFrame with partially null and fully null columns (post Woodwork 0.5.1) and still perform the expected behavior.""" + """Test to determine whether SimpleImputer can handle + a Woodwork-inited DataFrame with partially null and fully + null columns (post Woodwork 0.5.1) and still perform the + expected behavior.""" df = pd.DataFrame( {"part_null": [0, 1, 2, None], "all_null": [None, None, None, None]} ) diff --git a/evalml/tests/component_tests/test_xgboost_classifier.py b/evalml/tests/component_tests/test_xgboost_classifier.py index 846708e018..9357572e96 100644 --- a/evalml/tests/component_tests/test_xgboost_classifier.py +++ b/evalml/tests/component_tests/test_xgboost_classifier.py @@ -13,7 +13,7 @@ def test_xgboost_classifier_random_seed_bounds_seed(X_y_binary): - """ensure xgboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" + """ensure xgboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" X, y = X_y_binary col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/component_tests/test_xgboost_regressor.py b/evalml/tests/component_tests/test_xgboost_regressor.py index 87c2c764c3..9b2e809fbf 100644 --- a/evalml/tests/component_tests/test_xgboost_regressor.py +++ b/evalml/tests/component_tests/test_xgboost_regressor.py @@ -12,7 +12,7 @@ def test_xgboost_regressor_random_seed_bounds_seed(X_y_regression): - """ensure xgboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds.""" + """ensure xgboost's RNG doesn't fail for the min/max bounds we support on user-inputted random seeds""" X, y = X_y_regression col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index a55921e554..b80661d0bf 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -1039,14 +1039,14 @@ def churn_local(): @pytest.fixture def mock_imbalanced_data_X_y(): - """Helper function to return an imbalanced binary or multiclass dataset.""" + """Helper function to return an imbalanced binary or multiclass dataset""" def _imbalanced_data_X_y(problem_type, categorical_columns, size): """ "Generates a dummy classification dataset with particular amounts of class imbalance and categorical input columns. For our targets, we maintain a 1:5, or 0.2, class ratio of minority : majority. We only generate minimum amount for X to set the logical_types, so the length of X and y will be different. - Args: + Arguments: problem_type (str): Either 'binary' or 'multiclass' categorical_columns (str): Determines how many categorical cols to use. Either 'all', 'some', or 'none'. size (str): Either 'large' or 'small'. 'large' returns a dataset of size 21,000, while 'small' returns a size of 4200 @@ -1106,7 +1106,7 @@ class _AutoMLTestEnv: def __init__(self, problem_type): """Create a test environment. - Args: + Arguments: problem_type (str): The problem type corresponding to the search class you want to test. 
Attributes: @@ -1153,7 +1153,8 @@ def _patch_method(self, method, side_effect, return_value, pipeline_class_str=No return patch(pipeline_class_str + "." + method, **kwargs) def _reset_mocks(self): - """Set the mocks to None before running a computation so that we can prevent users from trying to access them before leaving the context manager.""" + """Set the mocks to None before running a computation so that we can prevent users from trying to access + them before leaving the context manager.""" self._mock_fit = None self._mock_tell = None self._mock_score = None @@ -1204,9 +1205,10 @@ def test_context( predict_proba_return_value=None, optimize_threshold_return_value=0.2, ): - """A context manager for creating an environment that patches time-consuming pipeline methods. Sets the mock_fit, mock_score, mock_encode_targets, mock_predict_proba, mock_optimize_threshold attributes. + """A context manager for creating an environment that patches time-consuming pipeline methods. + Sets the mock_fit, mock_score, mock_encode_targets, mock_predict_proba, mock_optimize_threshold attributes. - Args: + Arguments: score_return_value: Passed as the return_value argument of the pipeline.score patch. mock_score_side_effect: Passed as the side_effect argument of the pipeline.score patch. Takes precedence over score_return_value. diff --git a/evalml/tests/data_checks_tests/test_data_check.py b/evalml/tests/data_checks_tests/test_data_check.py index 58393ab572..10099a9992 100644 --- a/evalml/tests/data_checks_tests/test_data_check.py +++ b/evalml/tests/data_checks_tests/test_data_check.py @@ -22,7 +22,7 @@ def test_data_check_name(mock_data_check_class): assert mock_data_check_class.name == "MockDataCheck" class Funky_Name1DataCheck(mock_data_check_class): - """Mock data check with a funky name.""" + """Mock data check with a funky name""" assert Funky_Name1DataCheck().name == "Funky_Name1DataCheck" assert Funky_Name1DataCheck.name == "Funky_Name1DataCheck" diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py index 71156f931c..1c495ee5cb 100644 --- a/evalml/tests/data_checks_tests/test_data_checks.py +++ b/evalml/tests/data_checks_tests/test_data_checks.py @@ -564,7 +564,7 @@ class MockCheck(DataCheck): name = "mock_check" def __init__(self, foo, bar, baz=3): - """Mock init.""" + """Mock init""" def validate(self, X, y=None): """Mock validate.""" @@ -574,7 +574,7 @@ class MockCheck2(DataCheck): name = "MockCheck" def __init__(self, foo, bar, baz=3): - """Mock init.""" + """Mock init""" def validate(self, X, y=None): """Mock validate.""" @@ -587,7 +587,7 @@ def validate(self, X, y=None): [MockCheck], {"mock_check": 1}, DataCheckInitError, - "Args: for mock_check were not in a dictionary. Received 1.", + "Parameters for mock_check were not in a dictionary. Received 1.", ), ( [MockCheck], diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py index 68c99a3ad0..558c37ff68 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py @@ -33,10 +33,8 @@ def make_test_pipeline(estimator, base_class): """Make an estimator-only pipeline. - - This is helps test the exceptions raised in _compute_shap_values - without having to use make_pipeline (which needs training data to be - passed in). 
+    This helps test the exceptions raised in _compute_shap_values without having to use make_pipeline
+    (which needs training data to be passed in).
     """

     class Pipeline(base_class):
diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py
index aabe0d592b..932a03f023 100644
--- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py
+++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py
@@ -63,7 +63,7 @@ def test_error_metrics():
     (
         pd.DataFrame({"a": range(15)}),
         pd.Series(range(12)),
-        "^Args: y_true and input_features must have the same number of data points.",
+        "^Parameters y_true and input_features must have the same number of data points.",
     ),
 ]

@@ -152,7 +152,7 @@ def test_output_format_checked():
 regression_best_worst_answer = """Test Pipeline Name

-        Args: go here
+        Parameters go here

         Best 1 of 1

@@ -220,7 +220,7 @@ def test_output_format_checked():
 no_best_worst_answer = """Test Pipeline Name

-        Args: go here
+        Parameters go here

         1 of 2

@@ -253,7 +253,7 @@ def test_output_format_checked():
 binary_best_worst_answer = """Test Pipeline Name

-        Args: go here
+        Parameters go here

         Best 1 of 1

@@ -339,7 +339,7 @@ def test_output_format_checked():
 multiclass_best_worst_answer = """Test Pipeline Name

-        Args: go here
+        Parameters go here

         Best 1 of 1

@@ -414,7 +414,7 @@ def test_output_format_checked():
 multiclass_no_best_worst_answer = """Test Pipeline Name

-        Args: go here
+        Parameters go here

         1 of 2

@@ -846,7 +846,7 @@ def test_explain_predictions_best_worst_and_explain_predictions(
     }

     pipeline = MagicMock()
-    pipeline.parameters = "Args: go here"
+    pipeline.parameters = "Parameters go here"
     input_features = pd.DataFrame({"a": [3, 4]}, index=custom_index)
     pipeline.problem_type = problem_type
     pipeline.name = "Test Pipeline Name"
@@ -958,7 +958,7 @@ def _add_custom_index(answer, index_best, index_worst, output_format):

 regression_custom_metric_answer = """Test Pipeline Name

-        Args: go here
+        Parameters go here

         Best 1 of 1

@@ -1032,7 +1032,7 @@ def test_explain_predictions_best_worst_custom_metric(
         else {"explanations": ["explanation_dictionary_goes_here"]}
     )
     pipeline = MagicMock()
-    pipeline.parameters = "Args: go here"
+    pipeline.parameters = "Parameters go here"
     input_features = pd.DataFrame({"a": [5, 6]})
     pipeline.problem_type = ProblemTypes.REGRESSION
     pipeline.name = "Test Pipeline Name"
diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py
index 8e7e817a51..d057b496e1 100644
--- a/evalml/tests/model_understanding_tests/test_partial_dependence.py
+++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py
@@ -1311,10 +1311,9 @@ def test_graph_partial_dependence_ice_plot_two_way_error(

 def test_partial_dependence_scale_error():
-    """Test to catch the case when the scale of the features is so small that the 5th and 95th percentiles are too close to each other.
-
-    This is an sklearn exception.
-    """
+    """Test to catch the case when the scale of the features is so small
+    that the 5th and 95th percentiles are too close to each other. 
This is + an sklearn exception.""" pl = RegressionPipeline(["Random Forest Regressor"]) X = pd.DataFrame({"a": list(range(30)), "b": list(range(-10, 20))}) diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index 1ea105c804..f4e31d561b 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -22,10 +22,9 @@ class DoubleColumns(Transformer): """Custom transformer for testing permutation importance implementation. - We don't have any transformers that create features that you can - repeatedly "stack" on the previous output. That being said, I want - to test that our implementation can handle that case in the event we - add a transformer like that in the future. + We don't have any transformers that create features that you can repeatedly "stack" on the previous output. + That being said, I want to test that our implementation can handle that case in the event we add a transformer like + that in the future. """ name = "DoubleColumns" diff --git a/evalml/tests/objective_tests/test_binary_classification_objective.py b/evalml/tests/objective_tests/test_binary_classification_objective.py index 1b4254af22..5c009e266a 100644 --- a/evalml/tests/objective_tests/test_binary_classification_objective.py +++ b/evalml/tests/objective_tests/test_binary_classification_objective.py @@ -74,7 +74,7 @@ def assign_problem_type(self): @abstractmethod def assign_objective(self, **kwargs): - """Get objective object using specified parameters.""" + """Get objective object using specified parameters""" def run_pipeline(self, X_y_binary, **kwargs): self.X, self.y = X_y_binary @@ -95,7 +95,7 @@ def run_pipeline(self, X_y_binary, **kwargs): @abstractmethod def test_score(self, y_true, y_predicted, expected_score): - """Objective score matches expected score. + """Objective score matches expected score Args: y_true (pd.Series): true classes @@ -105,7 +105,7 @@ def test_score(self, y_true, y_predicted, expected_score): @abstractmethod def test_all_base_tests(self): - """Run all relevant tests from the base class.""" + """Run all relevant tests from the base class""" @pytest.fixture(scope="class") def fix_y_pred_na(self): diff --git a/evalml/tests/objective_tests/test_cost_benefit_matrix.py b/evalml/tests/objective_tests/test_cost_benefit_matrix.py index 7ce5eeff44..04a15452d6 100644 --- a/evalml/tests/objective_tests/test_cost_benefit_matrix.py +++ b/evalml/tests/objective_tests/test_cost_benefit_matrix.py @@ -8,25 +8,25 @@ def test_cbm_init(): with pytest.raises( - ValueError, match="Args: to CostBenefitMatrix must all be numeric values." + ValueError, match="Parameters to CostBenefitMatrix must all be numeric values." ): CostBenefitMatrix( true_positive=None, true_negative=-1, false_positive=-7, false_negative=-2 ) with pytest.raises( - ValueError, match="Args: to CostBenefitMatrix must all be numeric values." + ValueError, match="Parameters to CostBenefitMatrix must all be numeric values." ): CostBenefitMatrix( true_positive=1, true_negative=-1, false_positive=None, false_negative=-2 ) with pytest.raises( - ValueError, match="Args: to CostBenefitMatrix must all be numeric values." + ValueError, match="Parameters to CostBenefitMatrix must all be numeric values." 
): CostBenefitMatrix( true_positive=1, true_negative=None, false_positive=-7, false_negative=-2 ) with pytest.raises( - ValueError, match="Args: to CostBenefitMatrix must all be numeric values." + ValueError, match="Parameters to CostBenefitMatrix must all be numeric values." ): CostBenefitMatrix( true_positive=3, true_negative=-1, false_positive=-7, false_negative=None diff --git a/evalml/tests/objective_tests/test_objectives.py b/evalml/tests/objective_tests/test_objectives.py index 136c1ea119..b8db97c885 100644 --- a/evalml/tests/objective_tests/test_objectives.py +++ b/evalml/tests/objective_tests/test_objectives.py @@ -23,7 +23,7 @@ def test_create_custom_objective(): class MockEmptyObjective(ObjectiveBase): def objective_function(self, y_true, y_predicted, X=None): - """Docstring for mock objective function.""" + """Docstring for mock objective function""" with pytest.raises(TypeError): MockEmptyObjective() diff --git a/evalml/tests/pipeline_tests/test_component_graph.py b/evalml/tests/pipeline_tests/test_component_graph.py index a6a034ef72..319a26ffbf 100644 --- a/evalml/tests/pipeline_tests/test_component_graph.py +++ b/evalml/tests/pipeline_tests/test_component_graph.py @@ -54,15 +54,15 @@ def fit(self, X, y): class TransformerA(DummyTransformer): - """copy class.""" + """copy class""" class TransformerB(DummyTransformer): - """copy class.""" + """copy class""" class TransformerC(DummyTransformer): - """copy class.""" + """copy class""" class DummyEstimator(Estimator): @@ -81,15 +81,15 @@ def fit(self, X, y): class EstimatorA(DummyEstimator): - """copy class.""" + """copy class""" class EstimatorB(DummyEstimator): - """copy class.""" + """copy class""" class EstimatorC(DummyEstimator): - """copy class.""" + """copy class""" @pytest.fixture diff --git a/evalml/tests/utils_tests/test_woodwork_utils.py b/evalml/tests/utils_tests/test_woodwork_utils.py index 1fab9927c2..02f9f5b4bd 100644 --- a/evalml/tests/utils_tests/test_woodwork_utils.py +++ b/evalml/tests/utils_tests/test_woodwork_utils.py @@ -280,10 +280,10 @@ def test_ordinal_retains_order_min(): ), ) def test_infer_feature_types_NA_to_nan(null_col, already_inited): - """A short test to make sure that columns with all null values get converted from woodwork Unknown logical type with string physical type back to the original Double logical type with float physical type. - - Other Unknown columns should remain unchanged. - """ + """A short test to make sure that columns with all null values + get converted from woodwork Unknown logical type with string + physical type back to the original Double logical type with + float physical type. 
Other Unknown columns should remain unchanged.""" df = pd.DataFrame( { From 70572a084fa809f1ecd342ef45d5af3c31ec59a2 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Mon, 30 Aug 2021 23:18:59 -0400 Subject: [PATCH 19/62] clean up objectives --- docs/source/user_guide/components.ipynb | 8 +-- docs/source/user_guide/data_checks.ipynb | 2 +- docs/source/user_guide/objectives.ipynb | 6 +- docs/source/user_guide/pipelines.ipynb | 4 +- evalml/automl/automl_search.py | 2 +- .../binary_classification_objective.py | 2 + evalml/objectives/fraud_cost.py | 1 + evalml/objectives/lead_scoring.py | 1 + .../multiclass_classification_objective.py | 1 + evalml/objectives/objective_base.py | 1 + evalml/objectives/regression_objective.py | 1 + evalml/objectives/standard_metrics.py | 61 +++++++++++++------ .../time_series_regression_objective.py | 1 + evalml/pipelines/component_graph.py | 8 +-- evalml/tests/conftest.py | 6 +- 15 files changed, 69 insertions(+), 36 deletions(-) diff --git a/docs/source/user_guide/components.ipynb b/docs/source/user_guide/components.ipynb index 15ab3bdd40..4641e0d608 100644 --- a/docs/source/user_guide/components.ipynb +++ b/docs/source/user_guide/components.ipynb @@ -162,7 +162,7 @@ " def __init__(self, pct_null_threshold=1.0, random_seed=0, **kwargs):\n", " \"\"\"Initalizes an transformer to drop features whose percentage of NaN values exceeds a specified threshold.\n", "\n", - " Arguments:\n", + " Args:\n", " pct_null_threshold(float): The percentage of NaN values in an input feature to drop.\n", " Must be a value between [0, 1] inclusive. If equal to 0.0, will drop columns with any null values.\n", " If equal to 1.0, will drop columns with all null values. Defaults to 0.95.\n", @@ -180,7 +180,7 @@ " def fit(self, X, y=None):\n", " \"\"\"Fits DropNullColumns component to data\n", "\n", - " Arguments:\n", + " Args:\n", " X (pd.DataFrame): The input training data of shape [n_samples, n_features]\n", " y (pd.Series, optional): The target training data of length [n_samples]\n", "\n", @@ -200,7 +200,7 @@ " def transform(self, X, y=None):\n", " \"\"\"Transforms data X by dropping columns that exceed the threshold of null values.\n", "\n", - " Arguments:\n", + " Args:\n", " X (pd.DataFrame): Data to transform\n", " y (pd.Series, optional): Ignored.\n", "\n", @@ -272,7 +272,7 @@ " def __init__(self, strategy=\"mean\", random_seed=0, **kwargs):\n", " \"\"\"Baseline regressor that uses a simple strategy to make predictions.\n", "\n", - " Arguments:\n", + " Args:\n", " strategy (str): Method used to predict. Valid options are \"mean\", \"median\". Defaults to \"mean\".\n", " random_seed (int): Seed for the random number generator. Defaults to 0.\n", "\n", diff --git a/docs/source/user_guide/data_checks.ipynb b/docs/source/user_guide/data_checks.ipynb index cbefdd3f43..2694040ec6 100644 --- a/docs/source/user_guide/data_checks.ipynb +++ b/docs/source/user_guide/data_checks.ipynb @@ -628,7 +628,7 @@ " def __init__(self, problem_type, objective):\n", " \"\"\"\n", " A collection of basic data checks.\n", - " Arguments:\n", + " Args:\n", " problem_type (str): The problem type that is being validated. 
Can be regression, binary, or multiclass.\n", " \"\"\"\n", " if handle_problem_types(problem_type) == ProblemTypes.REGRESSION:\n", diff --git a/docs/source/user_guide/objectives.ipynb b/docs/source/user_guide/objectives.ipynb index c78ed15f02..2e91c58fd2 100644 --- a/docs/source/user_guide/objectives.ipynb +++ b/docs/source/user_guide/objectives.ipynb @@ -141,7 +141,7 @@ " fraud_payout_percentage=1.0, amount_col='amount'):\n", " \"\"\"Create instance of FraudCost\n", "\n", - " Arguments:\n", + " Args:\n", " retry_percentage (float): What percentage of customers that will retry a transaction if it\n", " is declined. Between 0 and 1. Defaults to .5\n", "\n", @@ -161,7 +161,7 @@ " def decision_function(self, ypred_proba, threshold=0.0, X=None):\n", " \"\"\"Determine if a transaction is fraud given predicted probabilities, threshold, and dataframe with transaction amount\n", "\n", - " Arguments:\n", + " Args:\n", " ypred_proba (pd.Series): Predicted probablities\n", " X (pd.DataFrame): Dataframe containing transaction amount\n", " threshold (float): Dollar threshold to determine if transaction is fraud\n", @@ -181,7 +181,7 @@ " def objective_function(self, y_true, y_predicted, X):\n", " \"\"\"Calculate amount lost to fraud per transaction given predictions, true values, and dataframe with transaction amount\n", "\n", - " Arguments:\n", + " Args:\n", " y_predicted (pd.Series): predicted fraud labels\n", " y_true (pd.Series): true fraud labels\n", " X (pd.DataFrame): dataframe with transaction amounts\n", diff --git a/docs/source/user_guide/pipelines.ipynb b/docs/source/user_guide/pipelines.ipynb index af134fc1ab..cec3d9ef22 100644 --- a/docs/source/user_guide/pipelines.ipynb +++ b/docs/source/user_guide/pipelines.ipynb @@ -398,7 +398,7 @@ " def __init__(self, pct_null_threshold=1.0, random_seed=0, **kwargs):\n", " \"\"\"Initalizes an transformer to drop features whose percentage of NaN values exceeds a specified threshold.\n", "\n", - " Arguments:\n", + " Args:\n", " pct_null_threshold(float): The percentage of NaN values in an input feature to drop.\n", " Must be a value between [0, 1] inclusive. If equal to 0.0, will drop columns with any null values.\n", " If equal to 1.0, will drop columns with all null values. Defaults to 0.95.\n", @@ -426,7 +426,7 @@ "\n", " def transform(self, X, y=None):\n", " \"\"\"Transforms data X by dropping columns that exceed the threshold of null values.\n", - " Arguments:\n", + " Args:\n", " X (pd.DataFrame): Data to transform\n", " y (pd.Series, optional): Targets\n", " Returns:\n", diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index a9f7e00496..6fb79d9a7c 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -85,7 +85,7 @@ def search( This method is provided for convenience. If you'd like more control over when each of these steps is run, consider making calls directly to the various pieces like the data checks and AutoMLSearch, instead of using this method. - Arguments: + Args: X_train (pd.DataFrame): The input training data of shape [n_samples, n_features]. Required. y_train (pd.Series): The target training data of length [n_samples]. Required for supervised learning tasks. 
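The docstring hunks in these commits all converge on the same Google-style layout that pydocstyle can enforce: a one-line summary ending in a period, an "Args:" section, and a "Returns:" section. A minimal sketch of that target shape (the function name load_example and its signature are invented here purely for illustration; they are not part of this patch):

    def load_example(n_rows=None, verbose=True):
        """Load an example dataset, which can be used for binary classification problems.

        Args:
            n_rows (int): Number of rows from the dataset to return. If None, returns all rows. Defaults to None.
            verbose (bool): Whether to print information about the dataset. Defaults to True.

        Returns:
            (pd.DataFrame, pd.Series): X and y.
        """

Note that pydocstyle verifies the shape of the docstring (presence, summary line, punctuation, blank lines) but not that the listed names match the function signature, which is why the section headers themselves are being normalized by hand across these commits.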
diff --git a/evalml/objectives/binary_classification_objective.py b/evalml/objectives/binary_classification_objective.py
index 608ae037c9..7e0ce67be5 100644
--- a/evalml/objectives/binary_classification_objective.py
+++ b/evalml/objectives/binary_classification_objective.py
@@ -1,3 +1,4 @@
+"""Base class for all binary classification objectives."""
 import numpy as np
 from scipy.optimize import minimize_scalar

@@ -69,6 +70,7 @@ def decision_function(self, ypred_proba, threshold=0.5, X=None):
         return ypred_proba > threshold

     def validate_inputs(self, y_true, y_predicted):
+        """Validate inputs for scoring."""
         super().validate_inputs(y_true, y_predicted)
         if len(np.unique(y_true)) > 2:
             raise ValueError("y_true contains more than two unique values")
diff --git a/evalml/objectives/fraud_cost.py b/evalml/objectives/fraud_cost.py
index 975ddff3c3..29db2d76ab 100644
--- a/evalml/objectives/fraud_cost.py
+++ b/evalml/objectives/fraud_cost.py
@@ -1,3 +1,4 @@
+"""Score the percentage of money lost, out of the total transaction amount processed, due to fraud."""
 from .binary_classification_objective import BinaryClassificationObjective


diff --git a/evalml/objectives/lead_scoring.py b/evalml/objectives/lead_scoring.py
index 3f10edf5e1..1cd05f9fab 100644
--- a/evalml/objectives/lead_scoring.py
+++ b/evalml/objectives/lead_scoring.py
@@ -1,3 +1,4 @@
+"""Lead scoring objective."""
 import math

 from .binary_classification_objective import BinaryClassificationObjective
diff --git a/evalml/objectives/multiclass_classification_objective.py b/evalml/objectives/multiclass_classification_objective.py
index cb76a9520a..835af0fc6a 100644
--- a/evalml/objectives/multiclass_classification_objective.py
+++ b/evalml/objectives/multiclass_classification_objective.py
@@ -1,3 +1,4 @@
+"""Base class for all multiclass classification objectives."""
 from .objective_base import ObjectiveBase

 from evalml.problem_types import ProblemTypes
diff --git a/evalml/objectives/objective_base.py b/evalml/objectives/objective_base.py
index 8a0e0ac16e..41ee61f637 100644
--- a/evalml/objectives/objective_base.py
+++ b/evalml/objectives/objective_base.py
@@ -196,4 +196,5 @@ def calculate_percent_difference(cls, score, baseline_score):

     @classmethod
     def is_defined_for_problem_type(cls, problem_type):
+        """Returns whether or not an objective is defined for a problem type."""
         return handle_problem_types(problem_type) in cls.problem_types
diff --git a/evalml/objectives/regression_objective.py b/evalml/objectives/regression_objective.py
index 69d9b3d116..f4d896e7fc 100644
--- a/evalml/objectives/regression_objective.py
+++ b/evalml/objectives/regression_objective.py
@@ -1,3 +1,4 @@
+"""Base class for all regression objectives."""
 from .objective_base import ObjectiveBase

 from evalml.problem_types import ProblemTypes
diff --git a/evalml/objectives/standard_metrics.py b/evalml/objectives/standard_metrics.py
index 095b68d494..b72a4c4ad2 100644
--- a/evalml/objectives/standard_metrics.py
+++ b/evalml/objectives/standard_metrics.py
@@ -1,3 +1,4 @@
+"""Standard machine learning objective functions."""
 import warnings

 import numpy as np
@@ -25,6 +26,7 @@ class AccuracyBinary(BinaryClassificationObjective):
     expected_range = [0, 1]

     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for accuracy score for binary classification."""
         return metrics.accuracy_score(y_true, y_predicted, sample_weight=sample_weight)


@@ -39,6 +41,7 @@ class AccuracyMulticlass(MulticlassClassificationObjective):
     expected_range = [0, 1]

     def 
objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for accuracy score for multiclass classification."""
         return metrics.accuracy_score(y_true, y_predicted, sample_weight=sample_weight)


@@ -53,6 +56,7 @@ class BalancedAccuracyBinary(BinaryClassificationObjective):
     expected_range = [0, 1]

     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for balanced accuracy score for binary classification."""
         return metrics.balanced_accuracy_score(
             y_true, y_predicted, sample_weight=sample_weight
         )
@@ -69,6 +73,7 @@ class BalancedAccuracyMulticlass(MulticlassClassificationObjective):
     expected_range = [0, 1]

     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for balanced accuracy score for multiclass classification."""
         return metrics.balanced_accuracy_score(
             y_true, y_predicted, sample_weight=sample_weight
         )
@@ -85,6 +90,7 @@ class F1(BinaryClassificationObjective):
     expected_range = [0, 1]

     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for F1 score for binary classification."""
         return metrics.f1_score(
             y_true, y_predicted, zero_division=0.0, sample_weight=sample_weight
         )
@@ -101,6 +107,7 @@ class F1Micro(MulticlassClassificationObjective):
     expected_range = [0, 1]

     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for F1 score for multiclass classification using micro averaging."""
         return metrics.f1_score(
             y_true,
             y_predicted,
@@ -121,6 +128,7 @@ class F1Macro(MulticlassClassificationObjective):
     expected_range = [0, 1]

     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for F1 score for multiclass classification using macro averaging."""
         return metrics.f1_score(
             y_true,
             y_predicted,
@@ -141,6 +149,7 @@ class F1Weighted(MulticlassClassificationObjective):
     expected_range = [0, 1]

     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for F1 score for multiclass classification using weighted averaging."""
         return metrics.f1_score(
             y_true,
             y_predicted,
@@ -161,6 +170,7 @@ class Precision(BinaryClassificationObjective):
     expected_range = [0, 1]

     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for precision score for binary classification."""
         return metrics.precision_score(
             y_true, y_predicted, zero_division=0.0, sample_weight=sample_weight
         )
@@ -177,6 +187,7 @@ class PrecisionMicro(MulticlassClassificationObjective):
     expected_range = [0, 1]

     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for precision score for multiclass classification using micro-averaging."""
         return metrics.precision_score(
             y_true,
             y_predicted,
@@ -187,7 +198,7 @@ def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):


 class PrecisionMacro(MulticlassClassificationObjective):
-    """Precision score for multiclass classification using macro averaging."""
+    """Precision score for multiclass classification using macro-averaging."""

     name = "Precision Macro"
     greater_is_better = True
@@ -197,6 +208,7 @@ class PrecisionMacro(MulticlassClassificationObjective):
     expected_range = [0, 1]

     def objective_function(self, y_true, y_predicted, X=None, sample_weight=None):
+        """Objective function for precision score for multiclass classification using macro-averaging."""
         return 
metrics.precision_score( y_true, y_predicted, @@ -217,6 +229,7 @@ class PrecisionWeighted(MulticlassClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for precision score for multiclass classification using weighted averaging.""" return metrics.precision_score( y_true, y_predicted, @@ -237,6 +250,7 @@ class Recall(BinaryClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for recall score for binary classification.""" return metrics.recall_score( y_true, y_predicted, zero_division=0.0, sample_weight=sample_weight ) @@ -253,6 +267,7 @@ class RecallMicro(MulticlassClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for recall score for multiclass classification using micro-averaging.""" return metrics.recall_score( y_true, y_predicted, @@ -273,6 +288,7 @@ class RecallMacro(MulticlassClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for recall score for multiclass classification using macro-averaging.""" return metrics.recall_score( y_true, y_predicted, @@ -293,6 +309,7 @@ class RecallWeighted(MulticlassClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for recall score for multiclass classification using weighted averaging.""" return metrics.recall_score( y_true, y_predicted, @@ -313,6 +330,7 @@ class AUC(BinaryClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for AUC score for binary classification.""" return metrics.roc_auc_score(y_true, y_predicted, sample_weight=sample_weight) @@ -327,6 +345,7 @@ class AUCMicro(MulticlassClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for AUC score for multiclass classification using micro-averaging.""" y_true, y_predicted = _handle_predictions(y_true, y_predicted) return metrics.roc_auc_score( y_true, y_predicted, average="micro", sample_weight=sample_weight @@ -344,6 +363,7 @@ class AUCMacro(MulticlassClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for AUC score for multiclass classification using macro-averaging.""" y_true, y_predicted = _handle_predictions(y_true, y_predicted) return metrics.roc_auc_score( y_true, y_predicted, average="macro", sample_weight=sample_weight @@ -361,6 +381,7 @@ class AUCWeighted(MulticlassClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for AUC Score for multiclass classification using weighted averaging.""" y_true, y_predicted = _handle_predictions(y_true, y_predicted) return metrics.roc_auc_score( y_true, y_predicted, average="weighted", sample_weight=sample_weight @@ -378,6 +399,7 @@ class Gini(BinaryClassificationObjective): expected_range = [-1, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for Gini coefficient for binary classification.""" auc = 
metrics.roc_auc_score(y_true, y_predicted, sample_weight=sample_weight) return 2 * auc - 1 @@ -393,6 +415,7 @@ class LogLossBinary(BinaryClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for log loss for binary classification.""" return metrics.log_loss(y_true, y_predicted, sample_weight=sample_weight) @@ -407,6 +430,7 @@ class LogLossMulticlass(MulticlassClassificationObjective): expected_range = [0, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for log loss for multiclass classification.""" return metrics.log_loss(y_true, y_predicted, sample_weight=sample_weight) @@ -421,6 +445,7 @@ class MCCBinary(BinaryClassificationObjective): expected_range = [-1, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for Matthews correlation coefficient for binary classification.""" with warnings.catch_warnings(): # catches runtime warning when dividing by 0.0 warnings.simplefilter("ignore", RuntimeWarning) @@ -440,6 +465,7 @@ class MCCMulticlass(MulticlassClassificationObjective): expected_range = [-1, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for Matthews correlation coefficient for multiclass classification.""" with warnings.catch_warnings(): # catches runtime warning when dividing by 0.0 warnings.simplefilter("ignore", RuntimeWarning) @@ -459,6 +485,7 @@ class RootMeanSquaredError(RegressionObjective): expected_range = [0, float("inf")] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for root mean squared error for regression.""" return metrics.mean_squared_error( y_true, y_predicted, squared=False, sample_weight=sample_weight ) @@ -467,8 +494,7 @@ def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): class RootMeanSquaredLogError(RegressionObjective): """Root mean squared log error for regression. - Only valid for nonnegative inputs.Otherwise, will throw a - ValueError. + Only valid for nonnegative inputs. Otherwise, will throw a ValueError. """ name = "Root Mean Squared Log Error" @@ -479,6 +505,7 @@ class RootMeanSquaredLogError(RegressionObjective): expected_range = [0, float("inf")] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for root mean squared log error for regression.""" return np.sqrt( metrics.mean_squared_log_error( y_true, y_predicted, sample_weight=sample_weight @@ -487,18 +514,14 @@ def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): @classproperty def positive_only(self): - """If True, this objective is only valid for positive data. - - Default False. - """ + """If True, this objective is only valid for positive data.""" return True class MeanSquaredLogError(RegressionObjective): """Mean squared log error for regression. - Only valid for nonnegative inputs. Otherwise, will throw a - ValueError + Only valid for nonnegative inputs. Otherwise, will throw a ValueError. 
""" name = "Mean Squared Log Error" @@ -509,16 +532,14 @@ class MeanSquaredLogError(RegressionObjective): expected_range = [0, float("inf")] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for mean squared log error for regression.""" return metrics.mean_squared_log_error( y_true, y_predicted, sample_weight=sample_weight ) @classproperty def positive_only(self): - """If True, this objective is only valid for positive data. - - Default False. - """ + """If True, this objective is only valid for positive data.""" return True @@ -533,6 +554,7 @@ class R2(RegressionObjective): expected_range = [-1, 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for coefficient of determination for regression.""" return metrics.r2_score(y_true, y_predicted, sample_weight=sample_weight) @@ -547,6 +569,7 @@ class MAE(RegressionObjective): expected_range = [0, float("inf")] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for mean absolute error for regression.""" return metrics.mean_absolute_error( y_true, y_predicted, sample_weight=sample_weight ) @@ -555,7 +578,7 @@ def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): class MAPE(TimeSeriesRegressionObjective): """Mean absolute percentage error for time series regression. Scaled by 100 to return a percentage. - Only valid for nonzero inputs. Otherwise, will throw a ValueError + Only valid for nonzero inputs. Otherwise, will throw a ValueError. """ name = "Mean Absolute Percentage Error" @@ -566,6 +589,7 @@ class MAPE(TimeSeriesRegressionObjective): expected_range = [0, float("inf")] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for mean absolute percentage error for time series regression.""" if (y_true == 0).any(): raise ValueError( "Mean Absolute Percentage Error cannot be used when " @@ -580,10 +604,7 @@ def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): @classproperty def positive_only(self): - """If True, this objective is only valid for positive data. - - Default False. 
- """ + """If True, this objective is only valid for positive data.""" return True @@ -598,6 +619,7 @@ class MSE(RegressionObjective): expected_range = [0, float("inf")] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for mean squared error for regression.""" return metrics.mean_squared_error( y_true, y_predicted, sample_weight=sample_weight ) @@ -614,6 +636,7 @@ class MedianAE(RegressionObjective): expected_range = [0, float("inf")] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for median absolute error for regression.""" return metrics.median_absolute_error( y_true, y_predicted, sample_weight=sample_weight ) @@ -630,6 +653,7 @@ class MaxError(RegressionObjective): expected_range = [0, float("inf")] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for maximum residual error for regression.""" return metrics.max_error(y_true, y_predicted) @@ -644,6 +668,7 @@ class ExpVariance(RegressionObjective): expected_range = [float("-inf"), 1] def objective_function(self, y_true, y_predicted, X=None, sample_weight=None): + """Objective function for explained variance score for regression.""" return metrics.explained_variance_score( y_true, y_predicted, sample_weight=sample_weight ) diff --git a/evalml/objectives/time_series_regression_objective.py b/evalml/objectives/time_series_regression_objective.py index 5d9229aa56..bb73dc3840 100644 --- a/evalml/objectives/time_series_regression_objective.py +++ b/evalml/objectives/time_series_regression_objective.py @@ -1,3 +1,4 @@ +"""Base class for all time series regression objectives.""" from .regression_objective import RegressionObjective from evalml.problem_types import ProblemTypes diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 3d926851dd..0578c5aa71 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -587,11 +587,11 @@ def graph(self, name=None, graph_format=None): @staticmethod def _get_edges(component_dict, edges_to_return="all"): - """ - Gets the edges for a component graph. + """Gets the edges for a component graph. - Arguments: - edges (str): The types of edges to return. Defaults to "all". + Args: + component_dict (dict): Component dictionary to get edges from. + edges_to_return (str): The types of edges to return. Defaults to "all". - if "all", returns all types of edges. - if "features", returns only feature edges - if "target", returns only target edges diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index bf90f867d3..3e2a1c2c10 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -1046,7 +1046,7 @@ def _imbalanced_data_X_y(problem_type, categorical_columns, size): For our targets, we maintain a 1:5, or 0.2, class ratio of minority : majority. We only generate minimum amount for X to set the logical_types, so the length of X and y will be different. - Arguments: + Args: problem_type (str): Either 'binary' or 'multiclass' categorical_columns (str): Determines how many categorical cols to use. Either 'all', 'some', or 'none'. size (str): Either 'large' or 'small'. 'large' returns a dataset of size 21,000, while 'small' returns a size of 4200 @@ -1106,7 +1106,7 @@ class _AutoMLTestEnv: def __init__(self, problem_type): """Create a test environment. - Arguments: + Args: problem_type (str): The problem type corresponding to the search class you want to test. 
Attributes:
@@ -1210,7 +1210,7 @@ def test_context(
         """A context manager for creating an environment that patches time-consuming pipeline methods.
         Sets the mock_fit, mock_score, mock_encode_targets, mock_predict_proba, mock_optimize_threshold attributes.

-        Arguments:
+        Args:
             score_return_value: Passed as the return_value argument of the pipeline.score patch.
             mock_score_side_effect: Passed as the side_effect argument of the pipeline.score patch. Takes precedence over score_return_value.

From f7bfb53926bdb3f147b06eb226a14856fa6a7f3b Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Mon, 30 Aug 2021 23:30:23 -0400
Subject: [PATCH 20/62] release notes update

---
 docs/source/release_notes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index f63ab11f06..825ac2dfcf 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -15,7 +15,7 @@ Release Notes
         * Replaced ``SMOTEOversampler``, ``SMOTENOversampler`` and ``SMOTENCOversampler`` with consolidated ``Oversampler`` component :pr:`2695`
         * Removed ``LinearRegressor`` from the list of default ``AutoMLSearch`` estimators due to poor performance :pr:`2660`
     * Documentation Changes
-        * Added docstring formatting linting :pr:`2670`
+        * Added docstring linting package ``pydocstyle`` and a rule to the ``make lint`` command :pr:`2670`
     * Testing Changes
         * Removes the process-level parallelism from the ``test_cancel_job`` test :pr:`2666`
         * Installed numba 0.53 in windows CI to prevent problems installing version 0.54 :pr:`2710`

From ba685e3439683fbc01b07052148b3a674add5723 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Tue, 31 Aug 2021 00:09:31 -0400
Subject: [PATCH 21/62] fix tests

---
 evalml/data_checks/data_checks.py                             | 2 +-
 .../model_understanding/prediction_explanations/explainers.py | 2 +-
 evalml/objectives/cost_benefit_matrix.py                      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/evalml/data_checks/data_checks.py b/evalml/data_checks/data_checks.py
index cc6f3c51d1..172aed85c5 100644
--- a/evalml/data_checks/data_checks.py
+++ b/evalml/data_checks/data_checks.py
@@ -68,7 +68,7 @@ def _init_data_checks(data_check_classes, params):
         class_params = params.get(data_check_class.name, {})
         if not isinstance(class_params, dict):
             raise DataCheckInitError(
-                f"Parameters: for {data_check_class.name} were not in a dictionary. Received {class_params}."
+                f"Parameters for {data_check_class.name} were not in a dictionary. Received {class_params}."
             )
         try:
             data_check_instances.append(data_check_class(**class_params))
diff --git a/evalml/model_understanding/prediction_explanations/explainers.py b/evalml/model_understanding/prediction_explanations/explainers.py
index 818e9a49ab..1f0cf85659 100644
--- a/evalml/model_understanding/prediction_explanations/explainers.py
+++ b/evalml/model_understanding/prediction_explanations/explainers.py
@@ -187,7 +187,7 @@ def explain_predictions_best_worst(
     )
     if y_true.shape[0] != input_features.shape[0]:
         raise ValueError(
-            "Args: y_true and input_features must have the same number of data points. Received: "
+            "Parameters y_true and input_features must have the same number of data points. 
Received: " f"true labels: {y_true.shape[0]} and {input_features.shape[0]}" ) if output_format not in {"text", "dict", "dataframe"}: diff --git a/evalml/objectives/cost_benefit_matrix.py b/evalml/objectives/cost_benefit_matrix.py index 00c44238f5..1f54ba4fa6 100644 --- a/evalml/objectives/cost_benefit_matrix.py +++ b/evalml/objectives/cost_benefit_matrix.py @@ -25,7 +25,7 @@ class CostBenefitMatrix(BinaryClassificationObjective): def __init__(self, true_positive, true_negative, false_positive, false_negative): if None in {true_positive, true_negative, false_positive, false_negative}: - raise ValueError("Args: to CostBenefitMatrix must all be numeric values.") + raise ValueError("Parameters to CostBenefitMatrix must all be numeric values.") self.true_positive = true_positive self.true_negative = true_negative From 4d302d02722b13231658f9e5fe95e71e960ee51c Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 31 Aug 2021 08:55:08 -0400 Subject: [PATCH 22/62] fix dependencies --- evalml/objectives/cost_benefit_matrix.py | 4 +++- .../dependency_update_check/latest_dependency_versions.txt | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/evalml/objectives/cost_benefit_matrix.py b/evalml/objectives/cost_benefit_matrix.py index 1f54ba4fa6..4640d76d22 100644 --- a/evalml/objectives/cost_benefit_matrix.py +++ b/evalml/objectives/cost_benefit_matrix.py @@ -25,7 +25,9 @@ class CostBenefitMatrix(BinaryClassificationObjective): def __init__(self, true_positive, true_negative, false_positive, false_negative): if None in {true_positive, true_negative, false_positive, false_negative}: - raise ValueError("Parameters to CostBenefitMatrix must all be numeric values.") + raise ValueError( + "Parameters to CostBenefitMatrix must all be numeric values." + ) self.true_positive = true_positive self.true_negative = true_negative diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt index f86d86405c..6483d54792 100644 --- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt +++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt @@ -16,7 +16,7 @@ nlp-primitives==1.1.0 numba==0.53.0 numpy==1.21.2 pandas==1.3.2 -plotly==5.3.0 +plotly==5.0.0 pmdarima==1.8.0 psutil==5.8.0 pyzmq==22.2.1 From c4520337776051af74ffe04478718296644f532f Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 31 Aug 2021 13:18:24 -0400 Subject: [PATCH 23/62] cleanup --- .../automl_algorithm/automl_algorithm.py | 2 +- evalml/automl/automl_search.py | 25 ++++++++++--------- evalml/automl/callbacks.py | 2 +- evalml/automl/engine/cf_engine.py | 2 +- evalml/automl/engine/dask_engine.py | 9 ++++--- evalml/automl/engine/engine_base.py | 7 ++---- evalml/automl/utils.py | 8 +++--- evalml/data_checks/data_check_action.py | 2 +- evalml/data_checks/data_check_message.py | 2 +- evalml/data_checks/data_checks.py | 2 +- .../data_checks/datetime_format_data_check.py | 2 +- evalml/data_checks/id_columns_data_check.py | 4 ++- .../data_checks/invalid_targets_data_check.py | 2 +- .../natural_language_nan_data_check.py | 4 +-- .../data_checks/target_leakage_data_check.py | 1 - evalml/demos/diabetes.py | 2 +- evalml/exceptions/__init__.py | 2 +- evalml/exceptions/exceptions.py | 2 +- evalml/model_understanding/graphs.py | 2 +- .../prediction_explanations/_algorithms.py | 2 +- evalml/objectives/objective_base.py | 2 +- evalml/objectives/utils.py | 8 +++--- evalml/pipelines/component_graph.py | 2 +- 
evalml/pipelines/components/component_base.py | 2 +- .../ensemble/sklearn_stacked_ensemble_base.py | 4 +-- .../classifiers/baseline_classifier.py | 4 +-- .../components/estimators/estimator.py | 4 +-- 27 files changed, 54 insertions(+), 56 deletions(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index 2013973cf7..ae1b6a7143 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -53,7 +53,7 @@ def __init__( def next_batch(self): """Get the next batch of pipelines to evaluate. - Returns + Returns: list(PipelineBase): a list of instances of PipelineBase subclasses, ready to be trained and evaluated. """ diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 6fb79d9a7c..ec69bb601e 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -116,7 +116,7 @@ def search( in time series problems, values should be passed in for the date_index, gap, and max_delay variables. Returns: - (AutoMLSearch, dict): the automl search object containing pipelines and rankings, and the results from running the data checks. If the data check results contain errors, automl search will not be run and an automl search object will not be returned. + (AutoMLSearch, dict): The automl search object containing pipelines and rankings, and the results from running the data checks. If the data check results contain errors, automl search will not be run and an automl search object will not be returned. """ X_train = infer_feature_types(X_train) y_train = infer_feature_types(y_train) @@ -328,7 +328,7 @@ class AutoMLSearch: ensembling (boolean): If True, runs ensembling in a separate batch after every allowed pipeline class has been iterated over. If the number of unique pipelines to search over per batch is one, ensembling will not run. Defaults to False. - max_batches (int): The maximum number of batches of pipelines to search. Args: max_time, and + max_batches (int): The maximum number of batches of pipelines to search. Parameters max_time, and max_iterations have precedence over stopping the search. problem_configuration (dict, None): Additional parameters needed to configure the search. For example, @@ -840,7 +840,7 @@ def _get_funct_name(function): search_desc = ( f"{handle_problem_types(self.problem_type).name} Search\n\n" - f"Args:: \n{'='*20}\n" + f"Parameters: \n{'='*20}\n" f"Objective: {get_objective(self.objective).name}\n" f"Max Time: {self.max_time}\n" f"Max Iterations: {self.max_iterations}\n" @@ -882,8 +882,8 @@ def _validate_problem_configuration(self, problem_configuration=None): def _handle_keyboard_interrupt(self): """Presents a prompt to the user asking if they want to stop the search. - Returns - bool: If True, search should terminate early + Returns: + bool: If True, search should terminate early. """ leading_char = "\n" start_of_loop = time.time() @@ -1323,10 +1323,10 @@ def get_pipeline(self, pipeline_id): """Given the ID of a pipeline training result, returns an untrained instance of the specified pipeline initialized with the parameters used to train that pipeline during automl search. 
Args: - pipeline_id (int): pipeline to retrieve + pipeline_id (int): Pipeline to retrieve - Returns - PipelineBase: untrained pipeline instance associated with the provided ID + Returns: + PipelineBase: Untrained pipeline instance associated with the provided ID """ pipeline_results = self.results["pipeline_results"].get(pipeline_id) if pipeline_results is None: @@ -1347,7 +1347,7 @@ def describe_pipeline(self, pipeline_id, return_dict=False): return_dict (bool): If True, return dictionary of information about pipeline. Defaults to False. - Returns + Returns: Description of specified pipeline. Includes information such as type of pipeline components, problem, training time, cross validation, etc. """ @@ -1444,8 +1444,9 @@ def add_to_rankings(self, pipeline): def results(self): """Class that allows access to a copy of the results from `automl_search`. - Returns dict containing `pipeline_results`: a dict with results from each pipeline, - and `search_order`: a list describing the order the pipelines were searched. + Returns: + dict: Dictionary containing `pipeline_results`, a dict with results from each pipeline, + and `search_order`, a list describing the order the pipelines were searched. """ return copy.deepcopy(self._results) @@ -1600,7 +1601,7 @@ def score_pipelines(self, pipelines, X_holdout, y_holdout, objectives): objectives (list(str), list(ObjectiveBase)): Objectives used for scoring. Returns: - Dict[str, Dict[str, float]]: Dictionary keyed by pipeline name that maps to a dictionary of scores. + dict[str, Dict[str, float]]: Dictionary keyed by pipeline name that maps to a dictionary of scores. Note that the any pipelines that error out during scoring will not be included in the dictionary but the exception and stacktrace will be displayed in the log. """ diff --git a/evalml/automl/callbacks.py b/evalml/automl/callbacks.py index 3aa12c722b..d4b030869b 100644 --- a/evalml/automl/callbacks.py +++ b/evalml/automl/callbacks.py @@ -44,5 +44,5 @@ def log_error_callback(exception, traceback, automl, **kwargs): logger.info( f"\t\t\tFold {fold_num}: Exception during automl search: {str(exception)}" ) - logger.debug(f"\t\t\tFold {fold_num}: Args::\n\t{pipeline.parameters}") + logger.debug(f"\t\t\tFold {fold_num}: Parameters:\n\t{pipeline.parameters}") logger.debug(f"\t\t\tFold {fold_num}: Traceback:\n{trace}") diff --git a/evalml/automl/engine/cf_engine.py b/evalml/automl/engine/cf_engine.py index 9b2b413c4e..54d7e95c0f 100644 --- a/evalml/automl/engine/cf_engine.py +++ b/evalml/automl/engine/cf_engine.py @@ -12,7 +12,7 @@ class CFClient: """Custom CFClient API to match Dask's CFClient and allow context management. Args: - pool(cf.ThreadPoolExecutor or cf.ProcessPoolExecutor): the resource pool to execute the futures work on. + pool(cf.ThreadPoolExecutor or cf.ProcessPoolExecutor): The resource pool to execute the futures work on. """ def __init__(self, pool): diff --git a/evalml/automl/engine/dask_engine.py b/evalml/automl/engine/dask_engine.py index 0a36856448..990adef418 100644 --- a/evalml/automl/engine/dask_engine.py +++ b/evalml/automl/engine/dask_engine.py @@ -62,10 +62,11 @@ def send_data_to_cluster(self, X, y): dask best practices. 
Args: - X (pd.DataFrame): input data for modeling - y (pd.Series): target data for modeling - Returns - dask.Future: the modeling data + X (pd.DataFrame): Input data for modeling + y (pd.Series): Target data for modeling + + Returns: + dask.Future: The modeling data """ data_hash = joblib.hash(X), joblib.hash(y) if data_hash in self._data_futures_cache: diff --git a/evalml/automl/engine/engine_base.py b/evalml/automl/engine/engine_base.py index c95c45b848..0e9acabaad 100644 --- a/evalml/automl/engine/engine_base.py +++ b/evalml/automl/engine/engine_base.py @@ -63,10 +63,7 @@ def error(self, msg): self.logs.append(("error", msg)) def write_to_logger(self, logger): - """Write all the messages to the logger. - - First In First Out order. - """ + """Write all the messages to the logger, first in, first out (FIFO) order.""" logger_method = { "info": logger.info, "debug": logger.debug, @@ -292,7 +289,7 @@ def train_and_score_pipeline( def evaluate_pipeline(pipeline, automl_config, X, y, logger): - """Submit this function to the submit_evaluation_job engine method. + """Function submitted to the submit_evaluation_job engine method. Args: pipeline (PipelineBase): The pipeline to score. diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 781cbc58f0..9c143c7f7a 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -176,7 +176,7 @@ def get_best_sampler_for_data(X, y, sampler_method, sampler_balanced_ratio): sampler_balanced_ratio (float): The ratio of min:majority targets that we would consider balanced, or should balance the classes to. - Returns + Returns: str, None: The string name of the sampling component to use, or None if no sampler is necessary """ # we check for the class balances @@ -207,10 +207,10 @@ def get_pipelines_from_component_graphs( Args: component_graphs_dict (dict): The dict of component graphs. problem_type (str or ProblemType): The problem type for which pipelines will be created. - parameters (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. - random_seed (int): Random seed. + parameters (dict): Pipeline-level parameters that should be passed to the proposed pipelines. Defaults to None. + random_seed (int): Random seed. Defaults to 0. - Returns + Returns: list: List of pipelines made from the passed component graphs. """ pipeline_class = { diff --git a/evalml/data_checks/data_check_action.py b/evalml/data_checks/data_check_action.py index 778c83eed9..e0a26ce3e8 100644 --- a/evalml/data_checks/data_check_action.py +++ b/evalml/data_checks/data_check_action.py @@ -2,7 +2,7 @@ class DataCheckAction: - """Recommended action returned by a DataCheck. + """A recommended action returned by a DataCheck. Args: action_code (DataCheckActionCode): Action code associated with the action. 
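The DataCheckAction class edited above is the object data checks emit in their "actions" list, and its dict form is what the validate() doctests later in this series assert against. A small usage sketch, assuming only the DataCheckActionCode enum and the to_dict serialization that the surrounding evalml code already provides:

    from evalml.data_checks import DataCheckAction, DataCheckActionCode

    action = DataCheckAction(
        action_code=DataCheckActionCode.DROP_COL,
        metadata={"column": "leak"},
    )
    # Serializes to the shape the doctests check, e.g.:
    # {"code": "DROP_COL", "metadata": {"column": "leak"}}
    print(action.to_dict())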
diff --git a/evalml/data_checks/data_check_message.py b/evalml/data_checks/data_check_message.py index 7d7b884c7b..d7cc709300 100644 --- a/evalml/data_checks/data_check_message.py +++ b/evalml/data_checks/data_check_message.py @@ -21,7 +21,7 @@ def __init__(self, message, data_check_name, message_code=None, details=None): self.details = details def __str__(self): - """Return a string representation of data check message, equivalent to self.message attribute.""" + """String representation of data check message, equivalent to self.message attribute.""" return self.message def __eq__(self, other): diff --git a/evalml/data_checks/data_checks.py b/evalml/data_checks/data_checks.py index 172aed85c5..6c1c0fddd8 100644 --- a/evalml/data_checks/data_checks.py +++ b/evalml/data_checks/data_checks.py @@ -26,7 +26,7 @@ class DataChecks: @staticmethod def _validate_data_checks(data_check_classes, params): - """Init a DataChecks instance from a list of DataCheck classes and corresponding params.""" + """Creates a DataChecks instance from a list of DataCheck classes and corresponding params.""" if not isinstance(data_check_classes, list): raise ValueError( f"Parameter data_checks must be a list. Received {type(data_check_classes).__name__}." diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py index 4626cd1988..5d4fc36d47 100644 --- a/evalml/data_checks/datetime_format_data_check.py +++ b/evalml/data_checks/datetime_format_data_check.py @@ -16,7 +16,7 @@ def __init__(self, datetime_column="index"): self.datetime_column = datetime_column def validate(self, X, y): - """Check if the target data has equal intervals and is sorted. + """Checks if the target data has equal intervals and is sorted. Args: X (pd.DataFrame, np.ndarray): Features. diff --git a/evalml/data_checks/id_columns_data_check.py b/evalml/data_checks/id_columns_data_check.py index b875428cec..1b344f78a8 100644 --- a/evalml/data_checks/id_columns_data_check.py +++ b/evalml/data_checks/id_columns_data_check.py @@ -24,12 +24,14 @@ def __init__(self, id_threshold=1.0): def validate(self, X, y=None): """Check if any of the features are likely to be ID columns. Currently performs a number of simple checks. + Checks performed are: + - column name is "id" - column name ends in "_id" - column contains all unique values (and is categorical / integer type) Args: - X (pd.DataFrame, np.ndarray): The input features to check.T + X (pd.DataFrame, np.ndarray): The input features to check. y (pd.Series): The target. Defaults to None. Ignored. Returns: diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py index 8f1c5e8ac0..113cbec3e7 100644 --- a/evalml/data_checks/invalid_targets_data_check.py +++ b/evalml/data_checks/invalid_targets_data_check.py @@ -50,7 +50,7 @@ def validate(self, X, y): X (pd.DataFrame, np.ndarray): Features. Ignored. y (pd.Series, np.ndarray): Target data to check for invalid values. - Returns + Returns: dict (DataCheckError): List with DataCheckErrors if any invalid values are found in the target data. Example: diff --git a/evalml/data_checks/natural_language_nan_data_check.py b/evalml/data_checks/natural_language_nan_data_check.py index be7cd2ee2e..c638f5adb6 100644 --- a/evalml/data_checks/natural_language_nan_data_check.py +++ b/evalml/data_checks/natural_language_nan_data_check.py @@ -15,11 +15,10 @@ def validate(self, X, y=None): X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Ignored. 
Defaults to None. - Returns + Returns: dict: dict with a DataCheckError if NaN values are present in natural language columns. Example: - >>> import pandas as pd >>> import woodwork as ww >>> import numpy as np @@ -37,7 +36,6 @@ def validate(self, X, y=None): ... message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, ... details={"columns": 'A'}).to_dict()] ... } - """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/data_checks/target_leakage_data_check.py b/evalml/data_checks/target_leakage_data_check.py index ce82e07b6b..ef2e7567f3 100644 --- a/evalml/data_checks/target_leakage_data_check.py +++ b/evalml/data_checks/target_leakage_data_check.py @@ -98,7 +98,6 @@ def validate(self, X, y): ... "errors": [], ... "actions": [{"code": "DROP_COL", ... "metadata": {"column": "leak"}}]} - """ results = {"warnings": [], "errors": [], "actions": []} diff --git a/evalml/demos/diabetes.py b/evalml/demos/diabetes.py index 4ee5ac9971..9642b28840 100644 --- a/evalml/demos/diabetes.py +++ b/evalml/demos/diabetes.py @@ -10,7 +10,7 @@ def load_diabetes(): """Load diabetes dataset. Used for regression problem. Returns: - pd.Dataframe, pd.Series): X and y + (pd.Dataframe, pd.Series): X and y """ filename = ( "https://api.featurelabs.com/datasets/diabetes.csv?library=evalml&version=" diff --git a/evalml/exceptions/__init__.py b/evalml/exceptions/__init__.py index edcf617321..39b8a0b77a 100644 --- a/evalml/exceptions/__init__.py +++ b/evalml/exceptions/__init__.py @@ -1,4 +1,4 @@ -"""Exception used in EvalML.""" +"""Exceptions used in EvalML.""" from .exceptions import ( MethodPropertyNotFoundError, PipelineNotFoundError, diff --git a/evalml/exceptions/exceptions.py b/evalml/exceptions/exceptions.py index 72140fe75d..40b6975280 100644 --- a/evalml/exceptions/exceptions.py +++ b/evalml/exceptions/exceptions.py @@ -99,7 +99,7 @@ class ParameterNotUsedWarning(UserWarning): def __init__(self, components): self.components = components - msg = f"Args: for components {components} will not be used to instantiate the pipeline since they don't appear in the pipeline" + msg = f"Parameters for components {components} will not be used to instantiate the pipeline since they don't appear in the pipeline" super().__init__(msg) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index c045b27aa2..b19e1b7e6b 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -949,7 +949,7 @@ def graph_partial_dependence( (PD) graph, 'individual' creates an individual conditional expectation (ICE) plot, and 'both' creates a single-figure PD and ICE plot. ICE plots can only be shown for one-way partial dependence plots. 
- Returns:: + Returns: plotly.graph_objects.Figure: figure object containing the partial dependence data for plotting Raises: diff --git a/evalml/model_understanding/prediction_explanations/_algorithms.py b/evalml/model_understanding/prediction_explanations/_algorithms.py index 9e71afee2d..5dbf59fd1f 100644 --- a/evalml/model_understanding/prediction_explanations/_algorithms.py +++ b/evalml/model_understanding/prediction_explanations/_algorithms.py @@ -204,7 +204,7 @@ def _normalize_values_dict(values): Returns: dict - Example: + Examples: >>> values = {"a": [1, -1, 3], "b": [3, -2, 0], "c": [-1, 3, 4]} >>> normalized_values = _normalize_values_dict(values) >>> assert normalized_values == {"a": [1/5, -1/6, 3/7], "b": [3/5, -2/6, 0/7], "c": [-1/5, 3/6, 4/7]} diff --git a/evalml/objectives/objective_base.py b/evalml/objectives/objective_base.py index 41ee61f637..0db65afca5 100644 --- a/evalml/objectives/objective_base.py +++ b/evalml/objectives/objective_base.py @@ -78,7 +78,7 @@ def objective_function(cls, y_true, y_predicted, X=None, sample_weight=None): def positive_only(cls): """If True, this objective is only valid for positive data. - Default False. + Defaults to False. """ return False diff --git a/evalml/objectives/utils.py b/evalml/objectives/utils.py index cc49f5c78f..14cc0f6967 100644 --- a/evalml/objectives/utils.py +++ b/evalml/objectives/utils.py @@ -13,7 +13,7 @@ def get_non_core_objectives(): Non-core objectives are objectives that are domain-specific. Users typically need to configure these objectives before using them in AutoMLSearch. - Returns:: + Returns: List of ObjectiveBase classes """ return [ @@ -44,7 +44,7 @@ def _all_objectives_dict(): def get_all_objective_names(): """Get a list of the names of all objectives. - Returns:: + Returns: list (str): Objective names """ all_objectives_dict = _all_objectives_dict() @@ -67,7 +67,7 @@ def get_core_objective_names(): def get_objective(objective, return_instance=False, **kwargs): - """Returns: the Objective class corresponding to a given objective name. + """Returns the Objective class corresponding to a given objective name. Args: objective (str or ObjectiveBase): Name or instance of the objective class. @@ -110,7 +110,7 @@ def get_objective(objective, return_instance=False, **kwargs): def get_core_objectives(problem_type): - """Returns: all core objective instances associated with the given problem type. + """Returns all core objective instances associated with the given problem type. Core objectives are designed to work out-of-the-box for any dataset. diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 0578c5aa71..d202202a51 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -487,7 +487,7 @@ def get_inputs(self, component_name): Args: component_name (str): Name of the component to look up. - Returns + Returns: list[str]: List of inputs for the component to use. 
""" try: diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index c282cac5a0..decf70fe36 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -179,7 +179,7 @@ def load(file_path): Args: file_path (str): Location to load file - Returns + Returns: ComponentBase object """ with open(file_path, "rb") as f: diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py index f689e7a601..79414eb9d6 100644 --- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py +++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py @@ -104,8 +104,8 @@ def feature_importance(self): def default_parameters(cls): """Returns the default parameters for stacked ensemble classes. - Returns - dict: default parameters for this component. + Returns: + dict: Default parameters for this component. """ return { "final_estimator": None, diff --git a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py index a18ec686d1..5ff42a4a33 100644 --- a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py @@ -122,7 +122,7 @@ def predict_proba(self, X): def feature_importance(self): """Returns importance associated with each feature. Since baseline classifiers do not use input features to calculate predictions, returns an array of zeroes. - Returns + Returns: np.ndarray (float): An array of zeroes """ return np.zeros(self._num_features) @@ -131,7 +131,7 @@ def feature_importance(self): def classes_(self): """Returns class labels. Will return None before fitting. - Returns + Returns: list[str] or list(float) : Class names """ return self._classes diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py index d1e377c21f..11ae564166 100644 --- a/evalml/pipelines/components/estimators/estimator.py +++ b/evalml/pipelines/components/estimators/estimator.py @@ -114,8 +114,8 @@ def predict_proba(self, X): def feature_importance(self): """Returns importance associated with each feature. - Returns - np.ndarray: Importance associated with each feature + Returns: + np.ndarray: Importance associated with each feature. 
""" try: return self._component_obj.feature_importances_ From 0b37cfa275ee223aaa86a1295995dc03e00a332d Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 31 Aug 2021 13:31:56 -0400 Subject: [PATCH 24/62] more cleanup --- .../time_series_baseline_estimator.py | 4 ++-- .../transformers/encoders/target_encoder.py | 4 ++-- .../latest_dependency_versions.txt | 2 +- evalml/utils/cli_utils.py | 18 +++--------------- evalml/utils/gen_utils.py | 2 +- evalml/utils/woodwork_utils.py | 6 +++--- requirements.txt | 2 +- 7 files changed, 13 insertions(+), 25 deletions(-) diff --git a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py index b7e3ffd833..49a05afc79 100644 --- a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py +++ b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py @@ -115,7 +115,7 @@ def feature_importance(self): Since baseline estimators do not use input features to calculate predictions, returns an array of zeroes. - Returns - np.ndarray (float): an array of zeroes + Returns: + np.ndarray (float): An array of zeroes. """ return np.zeros(self._num_features) diff --git a/evalml/pipelines/components/transformers/encoders/target_encoder.py b/evalml/pipelines/components/transformers/encoders/target_encoder.py index 6abfcdf94e..39c76ae950 100644 --- a/evalml/pipelines/components/transformers/encoders/target_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/target_encoder.py @@ -122,8 +122,8 @@ def fit_transform(self, X, y): def get_feature_names(self): """Return feature names for the input features after fitting. - Returns - np.array: The feature names after encoding + Returns: + np.array: The feature names after encoding. """ return self._component_obj.get_feature_names() diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt index 6483d54792..f86d86405c 100644 --- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt +++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt @@ -16,7 +16,7 @@ nlp-primitives==1.1.0 numba==0.53.0 numpy==1.21.2 pandas==1.3.2 -plotly==5.0.0 +plotly==5.3.0 pmdarima==1.8.0 psutil==5.8.0 pyzmq==22.2.1 diff --git a/evalml/utils/cli_utils.py b/evalml/utils/cli_utils.py index 56a1e45dbd..c1dee8bda3 100644 --- a/evalml/utils/cli_utils.py +++ b/evalml/utils/cli_utils.py @@ -16,11 +16,7 @@ def print_info(): - """Prints information about the system, evalml, and dependencies of evalml. - - Returns? - None - """ + """Prints information about the system, evalml, and dependencies of evalml.""" logger.info("EvalML version: %s" % evalml.__version__) logger.info("EvalML installation directory: %s" % get_evalml_root()) print_sys_info() @@ -28,11 +24,7 @@ def print_info(): def print_sys_info(): - """Prints system information. - - Returns: - None - """ + """Prints system information.""" logger.info("\nSYSTEM INFO") logger.info("-----------") sys_info = get_sys_info() @@ -41,11 +33,7 @@ def print_sys_info(): def print_deps(): - """Prints the version number of each dependency. 
- - Returns: - None - """ + """Prints the version number of each dependency.""" logger.info("\nINSTALLED VERSIONS") logger.info("------------------") installed_packages = get_installed_packages() diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index 973109fcfd..7c99927771 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -251,7 +251,7 @@ def jupyter_check(): """Get whether or not the code is being run in a Ipython environment (such as Jupyter Notebook or Jupyter Lab). Returns: - Boolean: True if Ipython, False otherwise. + boolean: True if Ipython, False otherwise. """ try: ipy = import_or_raise("IPython") diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py index 5d679bdb97..68410fb40a 100644 --- a/evalml/utils/woodwork_utils.py +++ b/evalml/utils/woodwork_utils.py @@ -54,7 +54,7 @@ def infer_feature_types(data, feature_types=None): mapping column names to the type of data represented in the column. If data is a 1D structure, then feature_types must be a Woodwork logical type or a string representing a Woodwork logical type ("Double", "Integer", "Boolean", "Categorical", "Datetime", "NaturalLanguage") - Returns + Returns: A Woodwork data structure where the data type of each column was either specified or inferred. """ if isinstance(data, list): @@ -121,7 +121,7 @@ def _retain_custom_types_and_initalize_woodwork( ltypes_to_ignore (list): List of Woodwork logical types to ignore. Columns from the old DataFrame that have a logical type specified in this list will not have their logical types carried over to the new DataFrame returned - Returns + Returns: A new DataFrame where any of the columns that exist in the old input DataFrame and the new DataFrame try to retain the original logical type, if possible and not specified to be ignored. """ @@ -156,7 +156,7 @@ def _convert_numeric_dataset_pandas(X, y): X (pd.DataFrame, np.ndarray): Data to transform y (pd.Series, np.ndarray): Target data - Returns + Returns: Tuple(pd.DataFrame, pd.Series): Transformed X and y """ X_ww = infer_feature_types(X) diff --git a/requirements.txt b/requirements.txt index 4130f5d7b8..d8c17d94d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -r core-requirements.txt -plotly==5.0.0 +plotly>=5.0.0 kaleido>=0.1.0 ipywidgets>=7.5 xgboost>=1.4.2 From 29bd5071327826f50f5e6233d213d4f8a6f4ac6d Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 31 Aug 2021 16:20:16 -0400 Subject: [PATCH 25/62] yet more cleanup --- evalml/automl/engine/dask_engine.py | 12 ++++++------ evalml/data_checks/data_checks.py | 2 +- evalml/model_understanding/graphs.py | 2 +- .../feature_selection/feature_selector.py | 4 ++-- .../preprocessing/datetime_featurizer.py | 6 +++--- .../preprocessing/delayed_feature_transformer.py | 6 +++--- .../transformers/preprocessing/drop_null_columns.py | 2 +- .../transformers/preprocessing/featuretools.py | 6 +++--- evalml/pipelines/components/utils.py | 8 ++++---- evalml/pipelines/pipeline_base.py | 7 +++---- evalml/pipelines/time_series_pipeline_base.py | 2 +- evalml/problem_types/problem_types.py | 4 ++-- evalml/tuners/skopt_tuner.py | 8 ++++---- 13 files changed, 34 insertions(+), 35 deletions(-) diff --git a/evalml/automl/engine/dask_engine.py b/evalml/automl/engine/dask_engine.py index 990adef418..ddb8e29f6e 100644 --- a/evalml/automl/engine/dask_engine.py +++ b/evalml/automl/engine/dask_engine.py @@ -62,11 +62,11 @@ def send_data_to_cluster(self, X, y): dask best practices. 
        Args:
-            X (pd.DataFrame): Input data for modeling
-            y (pd.Series): Target data for modeling
+            X (pd.DataFrame): Input data for modeling.
+            y (pd.Series): Target data for modeling.
 
         Returns:
-            dask.Future: The modeling data
+            dask.Future: The modeling data.
         """
         data_hash = joblib.hash(X), joblib.hash(y)
         if data_hash in self._data_futures_cache:
@@ -89,7 +89,7 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputat
 
         Returns:
             DaskComputation: An object wrapping a reference to a future-like computation
-                occurring in the dask cluster
+                occurring in the dask cluster.
         """
         logger = self.setup_job_log()
         X, y = self.send_data_to_cluster(X, y)
@@ -114,7 +114,7 @@ def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputatio
 
         Returns:
             DaskComputation: An object wrapping a reference to a future-like computation
-                occurring in the dask cluster
+                occurring in the dask cluster.
         """
         X, y = self.send_data_to_cluster(X, y)
         dask_future = self.client.submit(
@@ -136,7 +136,7 @@ def submit_scoring_job(
 
         Returns:
             DaskComputation: An object wrapping a reference to a future-like computation
-                occurring in the dask cluster
+                occurring in the dask cluster.
         """
         # Get the schema before we lose it
         X_schema = X.ww.schema
diff --git a/evalml/data_checks/data_checks.py b/evalml/data_checks/data_checks.py
index 6c1c0fddd8..334beda2d7 100644
--- a/evalml/data_checks/data_checks.py
+++ b/evalml/data_checks/data_checks.py
@@ -26,7 +26,7 @@ class DataChecks:
 
     @staticmethod
     def _validate_data_checks(data_check_classes, params):
-        """Creates a DataChecks instance from a list of DataCheck classes and corresponding params.""" 
+        """Creates a DataChecks instance from a list of DataCheck classes and corresponding params."""
         if not isinstance(data_check_classes, list):
             raise ValueError(
                 f"Parameter data_checks must be a list. Received {type(data_check_classes).__name__}."
diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index b19e1b7e6b..fffbe5283d 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -1165,7 +1165,7 @@ def _add_ice_plot(_go, fig, ice_data, label=None, row=None, col=None):
 
 def _calculate_axis_range(arr):
-    """A helper method to help calculate the appropriate range for an axis based on the data to graph."""
+    """Helper method to calculate the appropriate range for an axis based on the data to graph."""
     max_value = arr.max()
     min_value = arr.min()
     margins = abs(max_value - min_value) * 0.05
diff --git a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py
index f13ab921a2..d1ebc53e67 100644
--- a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py
+++ b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py
@@ -21,8 +21,8 @@ class FeatureSelector(Transformer):
     def get_names(self):
         """Get names of selected features.
 
-        Returns
-            list[str]: List of the names of features selected
+        Returns:
+            list[str]: List of the names of features selected.
""" selected_masks = self._component_obj.get_support() return [ diff --git a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py index a64fa8ee63..434f04d4c9 100644 --- a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py @@ -133,7 +133,7 @@ def transform(self, X, y=None): X (pd.DataFrame): Input features. y (pd.Series, optional): Ignored. - Returns + Returns: pd.DataFrame: Transformed X """ X = infer_feature_types(X) @@ -156,8 +156,8 @@ def transform(self, X, y=None): def get_feature_names(self): """Gets the categories of each datetime feature. - Returns - Dictionary, where each key-value pair is a column name and a dictionary + Returns: + dict: Dictionary, where each key-value pair is a column name and a dictionary mapping the unique feature values to their integer encoding. """ return self._categories diff --git a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py index 1b24cd81d8..1955575bef 100644 --- a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py @@ -62,7 +62,7 @@ def fit(self, X, y=None): X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, optional): The target training data of length [n_samples] - Returns + Returns: self """ return self @@ -103,7 +103,7 @@ def transform(self, X, y=None): X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used. y (pd.Series, or None): Target. - Returns + Returns: pd.DataFrame: Transformed X. """ if X is None: @@ -138,7 +138,7 @@ def fit_transform(self, X, y): X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used. y (pd.Series, or None): Target. - Returns + Returns: pd.DataFrame: Transformed X. """ return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py index ac441e453c..7102c30842 100644 --- a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py +++ b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py @@ -57,7 +57,7 @@ def transform(self, X, y=None): X (pd.DataFrame): Data to transform y (pd.Series, optional): Ignored. - Returns + Returns: pd.DataFrame: Transformed X """ X_t = infer_feature_types(X) diff --git a/evalml/pipelines/components/transformers/preprocessing/featuretools.py b/evalml/pipelines/components/transformers/preprocessing/featuretools.py index 1140b7cbc1..937c26e7c0 100644 --- a/evalml/pipelines/components/transformers/preprocessing/featuretools.py +++ b/evalml/pipelines/components/transformers/preprocessing/featuretools.py @@ -48,10 +48,10 @@ def fit(self, X, y=None): """Fits the DFSTransformer Transformer component. Args: - X (pd.DataFrame, np.array): The input data to transform, of shape [n_samples, n_features] - y (pd.Series): The target training data of length [n_samples] + X (pd.DataFrame, np.array): The input data to transform, of shape [n_samples, n_features]. 
+ y (pd.Series): The target training data of length [n_samples]. - Returns + Returns: self """ X_ww = infer_feature_types(X) diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index 02e5bc7704..6b5d0abf84 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -40,7 +40,7 @@ def allowed_model_families(problem_type): problem_types (ProblemTypes or str): ProblemTypes enum or string. Returns: - list[ModelFamily]: A list of model families + list[ModelFamily]: A list of model families. """ estimators = [] problem_type = handle_problem_types(problem_type) @@ -167,8 +167,8 @@ def predict(self, X): Args: X (pd.DataFrame): Features - Returns - np.ndarray: Predicted values + Returns: + np.ndarray: Predicted values. """ check_is_fitted(self, "is_fitted_") @@ -302,7 +302,7 @@ def make_balancing_dictionary(y, sampling_ratio): sampling_ratio (float): The balanced ratio we want the samples to meet Returns: - dict : Dictionary where keys are the classes, and the corresponding values are the counts of samples + dict: Dictionary where keys are the classes, and the corresponding values are the counts of samples for each class that will satisfy sampling_ratio. """ if sampling_ratio <= 0 or sampling_ratio > 1: diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index dd0400abf9..6a4e25b2ab 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -131,8 +131,7 @@ def name(self): def summary(self): """A short summary of the pipeline structure, describing the list of components used. - Example: - Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder + Example: Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder """ component_graph = [ type(self.component_graph.component_instances[component]) @@ -404,7 +403,7 @@ def feature_importance(self): """Importance associated with each feature. Features dropped by the feature selection are excluded. Returns: - pd.DataFrame : Feature names and their corresponding importance + pd.DataFrame: Feature names and their corresponding importance """ feature_names = self.input_feature_names[self._estimator_name] importance = list( @@ -476,7 +475,7 @@ def graph_feature_importance(self, importance_threshold=0): importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is larger than importance_threshold. Defaults to zero. Returns: - plotly.Figure : A bar graph showing features and their corresponding importance + plotly.Figure: A bar graph showing features and their corresponding importance """ go = import_or_raise( "plotly.graph_objects", diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py index 2c549ba015..5e5215530f 100644 --- a/evalml/pipelines/time_series_pipeline_base.py +++ b/evalml/pipelines/time_series_pipeline_base.py @@ -60,7 +60,7 @@ def fit(self, X, y): X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. y (pd.Series, np.ndarray): The target training targets of length [n_samples]. 
-        Returns
+        Returns:
            self
        """
        X, y = self._convert_to_woodwork(X, y)
diff --git a/evalml/problem_types/problem_types.py b/evalml/problem_types/problem_types.py
index b2d3555a61..fe17fd73c6 100644
--- a/evalml/problem_types/problem_types.py
+++ b/evalml/problem_types/problem_types.py
@@ -40,7 +40,7 @@ def _all_values(cls):
     def all_problem_types(cls):
         """Get a list of all defined problem types.
 
-        Returns
-            list(ProblemTypes): list
+        Returns:
+            list(ProblemTypes): List of all defined problem types.
         """
         return list(cls)
diff --git a/evalml/tuners/skopt_tuner.py b/evalml/tuners/skopt_tuner.py
index 3a1469964c..3310774963 100644
--- a/evalml/tuners/skopt_tuner.py
+++ b/evalml/tuners/skopt_tuner.py
@@ -36,7 +36,7 @@ def add(self, pipeline_parameters, score):
             pipeline_parameters (dict): A dict of the parameters used to evaluate a pipeline
             score (float): The score obtained by evaluating the pipeline with the provided parameters
 
-        Returns
+        Returns:
             None
         """
         # skip adding nan scores
@@ -47,7 +47,7 @@ def add(self, pipeline_parameters, score):
             self.opt.tell(flat_parameter_values, score)
         except Exception as e:
             logger.debug(
-                "SKOpt tuner received error during add. Score: {}\nArgs:: {}\nFlat parameter values: {}\nError: {}".format(
+                "SKOpt tuner received error during add. Parameters: {}\nScore: {}\nFlat parameter values: {}\nError: {}".format(
                     pipeline_parameters, score, flat_parameter_values, e
                 )
             )
@@ -62,8 +62,8 @@ def add(self, pipeline_parameters, score):
     def propose(self):
         """Returns a suggested set of parameters to train and score a pipeline with, based off the search space dimensions and prior samples.
 
-        Returns
-            dict: Proposed pipeline parameters
+        Returns:
+            dict: Proposed pipeline parameters.
         """
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")

From 43f55c3538ac36e80c3bd77e668c182d7d2f3646 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Tue, 31 Aug 2021 19:21:57 -0400
Subject: [PATCH 26/62] a final ounce of cleanup

---
 .../components/transformers/preprocessing/featuretools.py  | 2 +-
 evalml/utils/woodwork_utils.py                              | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/evalml/pipelines/components/transformers/preprocessing/featuretools.py b/evalml/pipelines/components/transformers/preprocessing/featuretools.py
index 937c26e7c0..0925395e8e 100644
--- a/evalml/pipelines/components/transformers/preprocessing/featuretools.py
+++ b/evalml/pipelines/components/transformers/preprocessing/featuretools.py
@@ -69,7 +69,7 @@ def transform(self, X, y=None):
             X (pd.DataFrame or np.ndarray): The input training data to transform. Has shape [n_samples, n_features]
             y (pd.Series, optional): Ignored.
 
-        Returns
+        Returns:
             pd.DataFrame: Feature matrix
         """
         X_ww = infer_feature_types(X)
diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py
index 68410fb40a..7a7e4fb256 100644
--- a/evalml/utils/woodwork_utils.py
+++ b/evalml/utils/woodwork_utils.py
@@ -153,11 +153,11 @@ def _convert_numeric_dataset_pandas(X, y):
     """Convert numeric and non-null data to pandas datatype. Raises ValueError if there is null or non-numeric data. Used with data sampler strategies.
 
     Args:
-        X (pd.DataFrame, np.ndarray): Data to transform
-        y (pd.Series, np.ndarray): Target data
+        X (pd.DataFrame, np.ndarray): Data to transform.
+        y (pd.Series, np.ndarray): Target data.
 
     Returns:
-        Tuple(pd.DataFrame, pd.Series): Transformed X and y
+        Tuple(pd.DataFrame, pd.Series): Transformed X and y.
""" X_ww = infer_feature_types(X) if not is_all_numeric(X_ww): From 5e0da3f3db379c3616a59869dec1015ef0d7fe83 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Wed, 8 Sep 2021 13:01:59 -0400 Subject: [PATCH 27/62] try darglint --- dev-requirements.txt | 1 + .../data_splitters/time_series_split.py | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 78258aa136..9d805135e0 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -5,3 +5,4 @@ flake8==3.7.0 black==21.5b1 isort==5.0.0 pydocstyle==6.1.1 +darglint==1.8.0 \ No newline at end of file diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py index c9e22f5457..d6033c994c 100644 --- a/evalml/preprocessing/data_splitters/time_series_split.py +++ b/evalml/preprocessing/data_splitters/time_series_split.py @@ -30,7 +30,16 @@ def __init__(self, max_delay=0, gap=0, date_index=None, n_splits=3): self._splitter = SkTimeSeriesSplit(n_splits=n_splits) def get_n_splits(self, X=None, y=None, groups=None): - """Get the number of data splits.""" + """Get the number of data splits. + + Args: + X (pd.DataFrame, None): Features to split. + y (pd.DataFrame, None): Target variable to split. Defaults to None. + groups: Ignored but kept for compatibility with sklearn API. Defaults to None. + + Returns: + Number of splits. + """ return self._splitter.n_splits @staticmethod @@ -49,8 +58,11 @@ def split(self, X, y=None, groups=None): y (pd.DataFrame, None): Target variable to split. Defaults to None. groups: Ignored but kept for compatibility with sklearn API. Defaults to None. - Returns: + Yields: Iterator of (train, test) indices tuples. + + Raises: + ValueError: If one of the proposed splits would be empty. 
""" # Sklearn splitters always assume a valid X is passed but we need to support the # TimeSeriesPipeline convention of being able to pass in empty X dataframes From c36071c448768392eee42c419c8a14700637026a Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Wed, 8 Sep 2021 18:29:47 -0400 Subject: [PATCH 28/62] some fixing linting --- Makefile | 1 + .../automl_algorithm/automl_algorithm.py | 2 +- .../automl_algorithm/iterative_algorithm.py | 4 +-- evalml/automl/automl_search.py | 6 ++-- evalml/automl/engine/cf_engine.py | 13 +++----- evalml/automl/engine/dask_engine.py | 15 +++++----- evalml/automl/engine/engine_base.py | 8 ++++- evalml/automl/engine/sequential_engine.py | 20 +++++++++++-- evalml/data_checks/data_check_action.py | 5 ++++ evalml/data_checks/data_check_message.py | 6 ++++ .../target_distribution_data_check.py | 1 - .../data_checks/target_leakage_data_check.py | 5 ++-- evalml/model_understanding/force_plots.py | 10 +++---- evalml/model_understanding/graphs.py | 8 +++-- .../permutation_importance.py | 3 ++ .../_user_interface.py | 30 +++++++++++++------ .../prediction_explanations/explainers.py | 14 +++++---- .../balanced_classification_sampler.py | 2 +- .../data_splitters/time_series_split.py | 4 +-- .../training_validation_split.py | 6 +++- evalml/preprocessing/utils.py | 3 +- setup.cfg | 5 +++- 22 files changed, 114 insertions(+), 57 deletions(-) diff --git a/Makefile b/Makefile index 8e7f60bcb1..d853764f0b 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ clean: lint: flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions pydocstyle evalml/ --convention=google --add-ignore=D107 --add-select=D400 --match-dir='^(?!(tests)).*' + darglint evalml/ --strictness=short black evalml -t py39 --check .PHONY: lint-fix diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index ae1b6a7143..430282bbdb 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -54,7 +54,7 @@ def next_batch(self): """Get the next batch of pipelines to evaluate. Returns: - list(PipelineBase): a list of instances of PipelineBase subclasses, ready to be trained and evaluated. + list[PipelineBase]: a list of instances of PipelineBase subclasses, ready to be trained and evaluated. """ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index ef098d619e..6c09beb262 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -69,7 +69,7 @@ def __init__( text_in_ensembling (boolean): If True and ensembling is True, then n_jobs will be set to 1 to avoid downstream sklearn stacking issues related to nltk. Defaults to None. pipeline_params (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. Defaults to None. custom_hyperparameters (dict or None): Custom hyperparameter ranges specified for pipelines to iterate over. Defaults to None. - _estimator_family_order (list(ModelFamily) or None): specify the sort order for the first batch. Defaults to None, which uses _ESTIMATOR_FAMILY_ORDER. + _estimator_family_order (list[ModelFamily]): specify the sort order for the first batch. Defaults to None, which uses _ESTIMATOR_FAMILY_ORDER. 
""" self._estimator_family_order = ( _estimator_family_order or _ESTIMATOR_FAMILY_ORDER @@ -134,7 +134,7 @@ def next_batch(self): """Get the next batch of pipelines to evaluate. Returns: - list(PipelineBase): a list of instances of PipelineBase subclasses, ready to be trained and evaluated. + list[PipelineBase]: a list of instances of PipelineBase subclasses, ready to be trained and evaluated. """ if self._batch_number == 1: if len(self._first_batch_results) == 0: diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 0fb917acdf..b952ecdfee 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -1602,7 +1602,7 @@ def train_pipelines(self, pipelines): This can be helpful for training pipelines once the search is complete. Args: - pipelines (list(PipelineBase)): List of pipelines to train. + pipelines (list[PipelineBase]): List of pipelines to train. Returns: Dict[str, PipelineBase]: Dictionary keyed by pipeline name that maps to the fitted pipeline. @@ -1642,10 +1642,10 @@ def score_pipelines(self, pipelines, X_holdout, y_holdout, objectives): """Score a list of pipelines on the given holdout data. Args: - pipelines (list(PipelineBase)): List of pipelines to train. + pipelines (list[PipelineBase]): List of pipelines to train. X_holdout (pd.DataFrame): Holdout features. y_holdout (pd.Series): Holdout targets for scoring. - objectives (list(str), list(ObjectiveBase)): Objectives used for scoring. + objectives (list[str], list[ObjectiveBase]): Objectives used for scoring. Returns: dict[str, Dict[str, float]]: Dictionary keyed by pipeline name that maps to a dictionary of scores. diff --git a/evalml/automl/engine/cf_engine.py b/evalml/automl/engine/cf_engine.py index 3a5b43594f..c6fd236227 100644 --- a/evalml/automl/engine/cf_engine.py +++ b/evalml/automl/engine/cf_engine.py @@ -1,9 +1,6 @@ -<<<<<<< HEAD """Custom CFClient API to match Dask's CFClient and allow context management.""" -======= from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor ->>>>>>> main from evalml.automl.engine.engine_base import ( EngineBase, EngineComputation, @@ -107,7 +104,7 @@ def __init__(self, client=None): self.client = client self._data_futures_cache = {} - def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputation: + def submit_evaluation_job(self, automl_config, pipeline, X, y): """Send evaluation job to cluster. Args: @@ -131,7 +128,7 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputat ) return CFComputation(future) - def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputation: + def submit_training_job(self, automl_config, pipeline, X, y): """Send training job to cluster. Args: @@ -149,9 +146,7 @@ def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputatio ) return CFComputation(future) - def submit_scoring_job( - self, automl_config, pipeline, X, y, objectives - ) -> EngineComputation: + def submit_scoring_job(self, automl_config, pipeline, X, y, objectives): """Send scoring job to cluster. Args: @@ -159,7 +154,7 @@ def submit_scoring_job( pipeline (pipeline.PipelineBase): Pipeline to train. X (pd.DataFrame): Input data for modeling. y (pd.Series): Target data for modeling. - objectives (list(ObjectiveBase)): Objectives to score on. + objectives (list[ObjectiveBase]): Objectives to score on. 
Returns: CFComputation: An object wrapping a reference to a future-like computation diff --git a/evalml/automl/engine/dask_engine.py b/evalml/automl/engine/dask_engine.py index f666a0d118..b045b079de 100644 --- a/evalml/automl/engine/dask_engine.py +++ b/evalml/automl/engine/dask_engine.py @@ -30,7 +30,10 @@ def get_result(self): """Gets the computation result. Will block until the computation is finished. Raises: - Exception: If computation fails. Returns traceback. + Exception: If computation fails. Returns traceback. + + Returns: + Computation results. """ return self.work.result() @@ -92,7 +95,7 @@ def send_data_to_cluster(self, X, y): ) return self._data_futures_cache[data_hash] - def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputation: + def submit_evaluation_job(self, automl_config, pipeline, X, y): """Send evaluation job to cluster. Args: @@ -117,7 +120,7 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputat ) return DaskComputation(dask_future) - def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputation: + def submit_training_job(self, automl_config, pipeline, X, y): """Send training job to cluster. Args: @@ -136,9 +139,7 @@ def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputatio ) return DaskComputation(dask_future) - def submit_scoring_job( - self, automl_config, pipeline, X, y, objectives - ) -> EngineComputation: + def submit_scoring_job(self, automl_config, pipeline, X, y, objectives): """Send scoring job to cluster. Args: @@ -146,7 +147,7 @@ def submit_scoring_job( pipeline (pipeline.PipelineBase): Pipeline to train. X (pd.DataFrame): Input data for modeling. y (pd.Series): Target data for modeling. - objectives (list(ObjectiveBase)): List of objectives to score on. + objectives (list[ObjectiveBase]): List of objectives to score on. Returns: DaskComputation: An object wrapping a reference to a future-like computation diff --git a/evalml/automl/engine/engine_base.py b/evalml/automl/engine/engine_base.py index 0e9acabaad..19a6a2b265 100644 --- a/evalml/automl/engine/engine_base.py +++ b/evalml/automl/engine/engine_base.py @@ -153,6 +153,10 @@ def train_and_score_pipeline( automl_config (AutoMLSearch): The AutoMLSearch object, used to access config and the error callback. full_X_train (pd.DataFrame): Training features. full_y_train (pd.Series): Training target. + logger: Logger object to write to. + + Raises: + Exception: If there are missing target values in the training set after data split. Returns: tuple of three items: First - A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details. @@ -296,6 +300,7 @@ def evaluate_pipeline(pipeline, automl_config, X, y, logger): automl_config (AutoMLConfig): The AutoMLSearch object, used to access config and the error callback. X (pd.DataFrame): Training features. y (pd.Series): Training target. + logger: Logger object to write to. Returns: tuple of three items: First - A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details. @@ -322,11 +327,12 @@ def score_pipeline(pipeline, X, y, objectives, X_schema=None, y_schema=None): pipeline (PipelineBase): The pipeline to score. X (pd.DataFrame): Features to score on. y (pd.Series): Target used to calcualte scores. + objectives (list[ObjectiveBase]): List of objectives to score on. X_schema (ww.TableSchema): Schema for features. Defaults to None. y_schema (ww.ColumnSchema): Schema for columns. Defaults to None. 
Returns: - dict containing pipeline scores. + dict: Dictionary object containing pipeline scores. """ if X_schema: X.ww.init(schema=X_schema) diff --git a/evalml/automl/engine/sequential_engine.py b/evalml/automl/engine/sequential_engine.py index 07074e2350..3bbc23c4a1 100644 --- a/evalml/automl/engine/sequential_engine.py +++ b/evalml/automl/engine/sequential_engine.py @@ -28,7 +28,11 @@ def __init__(self, work, **kwargs): self.meta_data = {} def done(self): - """Whether the computation is done.""" + """Whether the computation is done. + + Returns: + bool: Always returns True. + """ return True def get_result(self): @@ -36,6 +40,9 @@ def get_result(self): Raises: Exception: If computation fails. Returns traceback. + + Returns: + Computation results. """ return self.work(**self.kwargs) @@ -57,6 +64,9 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y): pipeline (pipeline.PipelineBase): Pipeline to evaluate. X (pd.DataFrame): Input data for modeling. y (pd.Series): Target data for modeling. + + Returns: + SequentialComputation: Computation result. """ logger = self.setup_job_log() return SequentialComputation( @@ -76,6 +86,9 @@ def submit_training_job(self, automl_config, pipeline, X, y): pipeline (pipeline.PipelineBase): Pipeline to evaluate. X (pd.DataFrame): Input data for modeling. y (pd.Series): Target data for modeling. + + Returns: + SequentialComputation: Computation result. """ return SequentialComputation( work=train_pipeline, @@ -94,7 +107,10 @@ def submit_scoring_job(self, automl_config, pipeline, X, y, objectives): pipeline (pipeline.PipelineBase): Pipeline to train. X (pd.DataFrame): Input data for modeling. y (pd.Series): Target data for modeling. - objectives (list(ObjectiveBase)): List of objectives to score on. + objectives (list[ObjectiveBase]): List of objectives to score on. + + Returns: + SequentialComputation: Computation result. """ objectives = [get_objective(o, return_instance=True) for o in objectives] computation = SequentialComputation( diff --git a/evalml/data_checks/data_check_action.py b/evalml/data_checks/data_check_action.py index e0a26ce3e8..5802732819 100644 --- a/evalml/data_checks/data_check_action.py +++ b/evalml/data_checks/data_check_action.py @@ -18,6 +18,11 @@ def __eq__(self, other): Two DataCheckAction objs are considered equivalent if all of their attributes are equivalent. + Args: + other: An object to compare equality with. + + Returns: + bool: True if the other object is considered an equivalent data check action, False otherwise. """ return self.action_code == other.action_code and self.metadata == other.metadata diff --git a/evalml/data_checks/data_check_message.py b/evalml/data_checks/data_check_message.py index d7cc709300..228bbb0108 100644 --- a/evalml/data_checks/data_check_message.py +++ b/evalml/data_checks/data_check_message.py @@ -29,6 +29,12 @@ def __eq__(self, other): Two DataCheckMessage objs are considered equivalent if all of their attributes are equivalent. + + Args: + other: An object to compare equality with. + + Returns: + bool: True if the other object is considered an equivalent data check message, False otherwise. 
""" return ( self.message_type == other.message_type diff --git a/evalml/data_checks/target_distribution_data_check.py b/evalml/data_checks/target_distribution_data_check.py index 4edb5e795b..52d7807d48 100644 --- a/evalml/data_checks/target_distribution_data_check.py +++ b/evalml/data_checks/target_distribution_data_check.py @@ -21,7 +21,6 @@ def validate(self, X, y): """Check if the target data has a certain distribution. Args: - X (pd.DataFrame, np.ndarray): Features. Ignored. y (pd.Series, np.ndarray): Target data to check for underlying distributions. diff --git a/evalml/data_checks/target_leakage_data_check.py b/evalml/data_checks/target_leakage_data_check.py index 0bd4d4628d..c2163ddcbe 100644 --- a/evalml/data_checks/target_leakage_data_check.py +++ b/evalml/data_checks/target_leakage_data_check.py @@ -70,9 +70,8 @@ def validate(self, X, y): Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. Args: - - X (pd.DataFrame, np.ndarray): The input features to check - y (pd.Series, np.ndarray): The target data + X (pd.DataFrame, np.ndarray): The input features to check. + y (pd.Series, np.ndarray): The target data. Returns: dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected. diff --git a/evalml/model_understanding/force_plots.py b/evalml/model_understanding/force_plots.py index 5f9a337e32..c129dd07c1 100644 --- a/evalml/model_understanding/force_plots.py +++ b/evalml/model_understanding/force_plots.py @@ -22,7 +22,7 @@ def graph_force_plot(pipeline, rows_to_explain, training_data, y, matplotlib=Fal Defaults to False. Returns: - list(dict(shap.AdditiveForceVisualizer)): The same as force_plot(), but with an additional + list[dict[shap.AdditiveForceVisualizer]]: The same as force_plot(), but with an additional key in each dictionary for the plot. """ @@ -62,12 +62,12 @@ def force_plot(pipeline, rows_to_explain, training_data, y): Args: pipeline (PipelineBase): The pipeline to generate the force plot for. - rows_to_explain (list(int)): A list of the indices of the training_data to explain. + rows_to_explain (list[int]): A list of the indices of the training_data to explain. training_data (pandas.DataFrame): The data used to train the pipeline. y (pandas.Series): The target data. Returns: - list(dict()): list of dictionaries where each dict contains force plot data. Each dictionary + list[dict]: list of dictionaries where each dict contains force plot data. Each dictionary entry represents the explanations for a single row. For single row binary force plots: @@ -87,8 +87,8 @@ def force_plot(pipeline, rows_to_explain, training_data, y): 'plot': AdditiveForceVisualizer}] Raises: - TypeError: if rows_to_explain is not a list. - TypeError: if all values in rows_to_explain aren't integers. + TypeError: If rows_to_explain is not a list. + TypeError: If all values in rows_to_explain aren't integers. """ if not isinstance(rows_to_explain, list): raise TypeError( diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index fffbe5283d..5d946f2ab2 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1358,6 +1358,10 @@ def visualize_decision_tree( Returns: graphviz.Source: DOT object that can be directly displayed in Jupyter notebooks. + + Raises: + ValueError: If the estimator is not a decision tree estimator. + NotFittedError: If the estimator is not fitted yet. 
""" if not estimator.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1574,7 +1578,7 @@ def graph_t_sne( marker_size=7, **kwargs, ): - """Plot high dimensional data into lower dimensional space using t-SNE . + """Plot high dimensional data into lower dimensional space using t-SNE. Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. @@ -1588,7 +1592,7 @@ def graph_t_sne( marker_size (int, optional): Determines the size of the marker. Returns: - plotly.Figure representing the transformed data + plotly.Figure: Figure representing the transformed data. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py index 0f715911e2..9f8295f3a6 100644 --- a/evalml/model_understanding/permutation_importance.py +++ b/evalml/model_understanding/permutation_importance.py @@ -25,6 +25,9 @@ def calculate_permutation_importance( Returns: pd.DataFrame: Mean feature importance scores over a number of shuffles. + + Raises: + ValueError: If objective cannot be used with the given pipeline. """ X = infer_feature_types(X) y = infer_feature_types(y) diff --git a/evalml/model_understanding/prediction_explanations/_user_interface.py b/evalml/model_understanding/prediction_explanations/_user_interface.py index cc59f29024..9c6b62bb30 100644 --- a/evalml/model_understanding/prediction_explanations/_user_interface.py +++ b/evalml/model_understanding/prediction_explanations/_user_interface.py @@ -181,13 +181,7 @@ def make_drill_down_dict( original_features, include_shap_values, ): - """Format the 'drill_down' section of the explanation report when output_format="dict". - - This section will include the feature values, feature names, - qualitative explanation and shap values (if - include_shap_values=True) for the features created from one of - the original features in the data. - """ + """Format the 'drill_down' section of the explanation report when output_format="dict". This section will include the feature values, feature names, qualitative explanation and shap values (if include_shap_values=True) for the features created from one of the original features in the data.""" drill_down = {} for parent_feature, children_features in provenance.items(): shap_for_children = { @@ -596,16 +590,22 @@ def make_text(self, rank): Args: rank (int): Rank (1, 2, 3, ...) of the prediction. Used to say "Best 1 of 5", "Worst 1 of 5", etc. + + Returns: + The heading section for reports formatted as text. """ prefix = self.prefixes[(rank // self.n_indices)] rank = rank % self.n_indices return [f"\t{prefix}{rank + 1} of {self.n_indices}\n\n"] def make_dict(self, rank): - """Makes the heading section for reports formatted as dictionaries. + """Makes the heading section for reports formatted as a dictionary. Args: rank (int): Rank (1, 2, 3, ...) of the prediction. Used to say "Best 1 of 5", "Worst 1 of 5", etc. + + Returns: + The heading section for reports formatted as a dictionary. """ prefix = self.prefixes[(rank // self.n_indices)] rank = rank % self.n_indices @@ -616,6 +616,9 @@ def make_dataframe(self, rank): Args: rank (int): Rank (1, 2, 3, ...) of the prediction. Used to say "Best 1 of 5", "Worst 1 of 5", etc. + + Returns: + The heading section for reports formatted as a dictionary. 
""" return self.make_dict(rank) @@ -640,6 +643,9 @@ def make_text(self, index, y_pred, y_true, scores, dataframe_index): scores (np.ndarray): Scores on the entire dataset. dataframe_index (pd.Series): pandas index for the entire dataset. Used to display the index in the data each explanation belongs to. + + Returns: + The predicted values section for classification problem best/worst reports formatted as text. """ pred_value = [ f"{col_name}: {pred}" @@ -693,6 +699,9 @@ def make_text(self, index, y_pred, y_true, scores, dataframe_index): scores (pd.Series): Scores on the entire dataset. dataframe_index (pd.Series): pandas index for the entire dataset. Used to display the index in the data each explanation belongs to. + + Returns: + The predicted values section for regression problem best/worst reports formatted as text. """ return [ f"\t\tPredicted Value: {round(y_pred.iloc[index], 3)}\n", @@ -738,6 +747,9 @@ def make_text(self, index, pipeline, pipeline_features, input_features): pipeline (PipelineBase): The pipeline to explain. pipeline_features (pd.DataFrame): The dataframe of features created by the pipeline. input_features (pd.Dataframe): The dataframe of features passed to the pipeline. + + Returns: + The SHAP table section for reports formatted as text. """ table = _make_single_prediction_shap_table( pipeline, @@ -803,7 +815,7 @@ def make_text(self, data): """Make a prediction explanation report that is formatted as text. Args: - data (_ReportData): Data passed in by the user. + data (_ReportData): Data passed in by the user. Returns: str diff --git a/evalml/model_understanding/prediction_explanations/explainers.py b/evalml/model_understanding/prediction_explanations/explainers.py index 1f0cf85659..9cc8406af6 100644 --- a/evalml/model_understanding/prediction_explanations/explainers.py +++ b/evalml/model_understanding/prediction_explanations/explainers.py @@ -52,7 +52,7 @@ def explain_predictions( pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP. input_features (pd.DataFrame): Dataframe of input data to evaluate the pipeline on. y (pd.Series): Labels for the input data. - indices_to_explain (list(int)): List of integer indices to explain. + indices_to_explain (list[int]): List of integer indices to explain. top_k_features (int): How many of the highest/lowest contributing feature to include in the table for each data point. Default is 3. include_shap_values (bool): Whether SHAP values should be included in the table. Default is False. @@ -60,7 +60,7 @@ def explain_predictions( output_format (str): Either "text", "dict", or "dataframe". Default is "text". Returns: - str, dict, or pd.DataFrame - A report explaining the top contributing features to each prediction for each row of input_features. + str, dict, or pd.DataFrame: A report explaining the top contributing features to each prediction for each row of input_features. The report will include the feature names, prediction contribution, and SHAP Value (optional). Raises: @@ -161,15 +161,17 @@ def explain_predictions_best_worst( callback (callable): Function to be called with incremental updates. Has the following parameters: - progress_stage: stage of computation - time_elapsed: total time in seconds that has elapsed since start of call + Returns: - str, dict, or pd.DataFrame - A report explaining the top contributing features for the best/worst predictions in the input_features. 
+        str, dict, or pd.DataFrame: A report explaining the top contributing features for the best/worst predictions in the input_features.
            For each of the best/worst rows of input_features, the predicted values, true labels, metric value,
            feature names, prediction contribution, and SHAP Value (optional) will be listed.
 
    Raises:
-        ValueError: if input_features does not have more than twice the requested features to explain.
-        ValueError: if y_true and input_features have mismatched lengths.
-        ValueError: if an output_format outside of "text", "dict" or "dataframe is provided.
+        ValueError: If input_features does not have more than twice the requested features to explain.
+        ValueError: If y_true and input_features have mismatched lengths.
+        ValueError: If an output_format outside of "text", "dict" or "dataframe" is provided.
+        PipelineScoreError: If the pipeline errors out while scoring.
    """
    start_time = timer()
    _update_progress(
diff --git a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py
index 77fa844910..b1de501847 100644
--- a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py
+++ b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py
@@ -55,7 +55,7 @@ def _find_ideal_samples(self, y):
         Args:
             y (pd.Series): Target data passed in.
 
-        Returns
+        Returns:
             (dict): dictionary with undersample target class as key, and number of samples to remove as the value.
                 If we don't need to resample, returns empty dictionary.
         """
diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py
index d6033c994c..c75996eddc 100644
--- a/evalml/preprocessing/data_splitters/time_series_split.py
+++ b/evalml/preprocessing/data_splitters/time_series_split.py
@@ -31,12 +31,12 @@ def __init__(self, max_delay=0, gap=0, date_index=None, n_splits=3):
 
     def get_n_splits(self, X=None, y=None, groups=None):
         """Get the number of data splits.
-        
+
         Args:
             X (pd.DataFrame, None): Features to split.
             y (pd.DataFrame, None): Target variable to split. Defaults to None.
             groups: Ignored but kept for compatibility with sklearn API. Defaults to None.
-        
+
         Returns:
             Number of splits.
         """
diff --git a/evalml/preprocessing/data_splitters/training_validation_split.py b/evalml/preprocessing/data_splitters/training_validation_split.py
index 54b9c3914f..276ad4b54b 100644
--- a/evalml/preprocessing/data_splitters/training_validation_split.py
+++ b/evalml/preprocessing/data_splitters/training_validation_split.py
@@ -34,7 +34,11 @@ def __init__(
 
     @staticmethod
     def get_n_splits():
-        """Return the number of splits of this object."""
+        """Return the number of splits of this object.
+
+        Returns:
+            int: Always returns 1.
+        """
         return 1
 
     def split(self, X, y=None):
diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
index 01e8db0e54..1055d946da 100644
--- a/evalml/preprocessing/utils.py
+++ b/evalml/preprocessing/utils.py
@@ -21,9 +21,10 @@ def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwarg
         n_rows (int): Number of rows to return. Defaults to None.
         drop (list): List of columns to drop. Defaults to None.
         verbose (bool): If True, prints information about features and target. Defaults to True.
+        **kwargs: Other keyword arguments that should be passed to pandas' `read_csv` method.
 
     Returns:
-        pd.DataFrame, pd.Series: Features matrix and target.
""" feature_matrix = pd.read_csv(path, index_col=index, nrows=n_rows, **kwargs) diff --git a/setup.cfg b/setup.cfg index df64a8b8d2..c9431f1a59 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,4 +19,7 @@ test=pytest profile=black forced_separate=evalml multi_line_output=3 -skip=__init__.py \ No newline at end of file +skip=__init__.py +[darglint] +ignore=DAR402 +ignore_regex=^_(.*) From b980a8e4966ca52db74216f8cb5e28d357f0afeb Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Wed, 8 Sep 2021 23:07:27 -0400 Subject: [PATCH 29/62] more fixing --- Makefile | 2 +- .../automl_algorithm/automl_algorithm.py | 5 +- .../automl_algorithm/iterative_algorithm.py | 8 ++- evalml/automl/automl_search.py | 68 +++++++++---------- evalml/automl/callbacks.py | 15 ++++ evalml/automl/pipeline_search_plots.py | 3 + evalml/automl/utils.py | 10 +-- .../permutation_importance.py | 4 ++ evalml/tuners/skopt_tuner.py | 4 ++ setup.cfg | 1 + 10 files changed, 78 insertions(+), 42 deletions(-) diff --git a/Makefile b/Makefile index d853764f0b..16a7f6ca4d 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ clean: lint: flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions pydocstyle evalml/ --convention=google --add-ignore=D107 --add-select=D400 --match-dir='^(?!(tests)).*' - darglint evalml/ --strictness=short + darglint evalml/ black evalml -t py39 --check .PHONY: lint-fix diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index 430282bbdb..4b3579691f 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -54,7 +54,7 @@ def next_batch(self): """Get the next batch of pipelines to evaluate. Returns: - list[PipelineBase]: a list of instances of PipelineBase subclasses, ready to be trained and evaluated. + list[PipelineBase]: A list of instances of PipelineBase subclasses, ready to be trained and evaluated. """ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): @@ -64,6 +64,9 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines. pipeline (PipelineBase): The trained pipeline object which was used to compute the score. trained_pipeline_results (dict): Results from training a pipeline. + + Raises: + PipelineNotFoundError: If pipeline is not allowed in search. """ if pipeline.name not in self._tuners: raise PipelineNotFoundError( diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 6c09beb262..a74a60ebd6 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -134,7 +134,10 @@ def next_batch(self): """Get the next batch of pipelines to evaluate. Returns: - list[PipelineBase]: a list of instances of PipelineBase subclasses, ready to be trained and evaluated. + list[PipelineBase]: A list of instances of PipelineBase subclasses, ready to be trained and evaluated. + + Raises: + AutoMLAlgorithmException: If no results were reported from the first batch. 
""" if self._batch_number == 1: if len(self._first_batch_results) == 0: @@ -203,6 +206,9 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines. pipeline (PipelineBase): The trained pipeline object which was used to compute the score. trained_pipeline_results (dict): Results from training a pipeline. + + Raises: + ValueError: If default parameters are not in the acceptable hyperparameter ranges. """ if pipeline.model_family != ModelFamily.ENSEMBLE: if self.batch_number == 1: diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index b952ecdfee..886452f8d6 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -81,6 +81,8 @@ def build_engine_from_str(engine_str): Returns: (EngineBase): Instance of the requested engine. + Raises: + ValueError: If engine_str is not a valid engine. """ valid_engines = [ "sequential", @@ -119,41 +121,34 @@ def search( """Given data and configuration, run an automl search. This method will run EvalML's default suite of data checks. If the data checks produce errors, the data check results will be returned before running the automl search. In that case we recommend you alter your data to address these errors and try again. - This method is provided for convenience. If you'd like more control over when each of these steps is run, consider making calls directly to the various pieces like the data checks and AutoMLSearch, instead of using this method. Args: X_train (pd.DataFrame): The input training data of shape [n_samples, n_features]. Required. - y_train (pd.Series): The target training data of length [n_samples]. Required for supervised learning tasks. - problem_type (str or ProblemTypes): Type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. - objective (str, ObjectiveBase): The objective to optimize for. Used to propose and rank pipelines, but not for optimizing each pipeline during fit-time. When set to 'auto', chooses: - - - LogLossBinary for binary classification problems, - - LogLossMulticlass for multiclass classification problems, and - - R2 for regression problems. - + - LogLossBinary for binary classification problems, + - LogLossMulticlass for multiclass classification problems, and + - R2 for regression problems. mode (str): mode for DefaultAlgorithm. There are two modes: fast and long, where fast is a subset of long. Please look at DefaultAlgorithm for more details. - max_time (int, str): Maximum time to search for pipelines. This will not start a new pipeline search after the duration has elapsed. If it is an integer, then the time will be in seconds. For strings, time can be specified as seconds, minutes, or hours. - patience (int): Number of iterations without improvement to stop search early. Must be positive. If None, early stopping is disabled. Defaults to None. - tolerance (float): Minimum percentage difference to qualify as score improvement for early stopping. Only applicable if patience is not None. Defaults to None. - problem_configuration (dict): Additional parameters needed to configure the search. For example, in time series problems, values should be passed in for the date_index, gap, and max_delay variables. Returns: (AutoMLSearch, dict): The automl search object containing pipelines and rankings, and the results from running the data checks. 
            If the data check results contain errors, automl search will not be run and an automl search object will not be returned.
+
+    Raises:
+        ValueError: If the search configuration is invalid.
     """
     X_train = infer_feature_types(X_train)
     y_train = infer_feature_types(y_train)
@@ -219,30 +214,26 @@ def search_iterative(
     """Given data and configuration, run an automl search.
 
     This method will run EvalML's default suite of data checks. If the data checks produce errors, the data check results will be returned before running the automl search. In that case we recommend you alter your data to address these errors and try again.
-
     This method is provided for convenience. If you'd like more control over when each of these steps is run, consider making calls directly to the various pieces like the data checks and AutoMLSearch, instead of using this method.
 
     Args:
         X_train (pd.DataFrame): The input training data of shape [n_samples, n_features]. Required.
-
         y_train (pd.Series): The target training data of length [n_samples]. Required for supervised learning tasks.
-
         problem_type (str or ProblemTypes): Type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list.
-
         objective (str, ObjectiveBase): The objective to optimize for. Used to propose and rank pipelines, but not for optimizing each pipeline during fit-time. When set to 'auto', chooses:
-
             - LogLossBinary for binary classification problems,
             - LogLossMulticlass for multiclass classification problems, and
             - R2 for regression problems.
-
         problem_configuration (dict): Additional parameters needed to configure the search. For example,
-        in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
-
-        Other keyword arguments which are provided will be passed to AutoMLSearch.
+            in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
+        **kwargs: Other keyword arguments provided will be passed to AutoMLSearch.
 
     Returns:
         (AutoMLSearch, dict): The automl search object containing pipelines and rankings, and the results from running the data checks. If the data check results contain errors, automl search will not be run and an automl search object will not be returned.
+
+    Raises:
+        ValueError: If the search configuration is invalid.
     """
     X_train = infer_feature_types(X_train)
     y_train = infer_feature_types(y_train)
@@ -955,11 +946,11 @@ def search(self, show_iteration_plot=True):
         """Find the best pipeline for the data set.
 
         Args:
-            feature_types (list, optional): list of feature types, either numerical or categorical.
-                Categorical features will automatically be encoded
-
             show_iteration_plot (boolean, True): Shows an iteration vs. score plot in Jupyter notebook.
                 Disabled by default in non-Jupyter environments.
+
+        Raises:
+            AutoMLSearchException: If all pipelines in the current AutoML batch produced a score of np.nan on the primary objective.
         """
         if self._searched:
             logger.info(
@@ -1370,10 +1361,13 @@ def get_pipeline(self, pipeline_id):
         """Given the ID of a pipeline training result, returns an untrained instance of the specified pipeline initialized with the parameters used to train that pipeline during automl search.
 
         Args:
-            pipeline_id (int): Pipeline to retrieve
+            pipeline_id (int): Pipeline to retrieve.
 
         Returns:
-            PipelineBase: Untrained pipeline instance associated with the provided ID
+            PipelineBase: Untrained pipeline instance associated with the provided ID.
+
+        Raises:
+            PipelineNotFoundError: If pipeline_id is not a valid ID.
         """
         pipeline_results = self.results["pipeline_results"].get(pipeline_id)
         if pipeline_results is None:
@@ -1397,6 +1391,9 @@ def describe_pipeline(self, pipeline_id, return_dict=False):
 
         Returns:
             Description of specified pipeline. Includes information such as type of pipeline components, problem, training time, cross validation, etc.
+
+        Raises:
+            PipelineNotFoundError: If pipeline_id is not a valid ID.
         """
         if pipeline_id not in self._results["pipeline_results"]:
             raise PipelineNotFoundError("Pipeline not found")
@@ -1543,6 +1540,9 @@ def best_pipeline(self):
 
         Returns:
             PipelineBase: A trained instance of the best pipeline and parameters found during automl search. If `train_best_pipeline` is set to False, returns an untrained pipeline instance.
+
+        Raises:
+            PipelineNotFoundError: If this is called before .search() is called.
         """
         if not self._best_pipeline:
             raise PipelineNotFoundError(
@@ -1560,12 +1560,12 @@ def save(
         """Saves AutoML object at file path.
 
         Args:
-            file_path (str): location to save file
-            pickle_type {"pickle", "cloudpickle"}: the pickling library to use.
-            pickle_protocol (int): the pickle data stream format.
+            file_path (str): Location to save file.
+            pickle_type ({"pickle", "cloudpickle"}): The pickling library to use.
+            pickle_protocol (int): The pickle data stream format.
 
-        Returns:
-            None
+        Raises:
+            ValueError: If pickle_type is not "pickle" or "cloudpickle".
         """
         if pickle_type == "cloudpickle":
             pkl_lib = cloudpickle
@@ -1587,8 +1587,8 @@ def load(
         """Loads AutoML object at file path.
 
         Args:
-            file_path (str): location to find file to load
-            pickle_type {"pickle", "cloudpickle"}: the pickling library to use. Currently not used since the standard pickle library can handle cloudpickles.
+            file_path (str): Location to find file to load.
+            pickle_type ({"pickle", "cloudpickle"}): The pickling library to use. Currently not used since the standard pickle library can handle cloudpickles.
 
         Returns:
             AutoSearchBase object
diff --git a/evalml/automl/callbacks.py b/evalml/automl/callbacks.py
index d4b030869b..bf640e9a91 100644
--- a/evalml/automl/callbacks.py
+++ b/evalml/automl/callbacks.py
@@ -13,6 +13,15 @@ def raise_error_callback(exception, traceback, automl, **kwargs):
     """Raises the exception thrown by the AutoMLSearch object.
 
     Also logs the exception as an error.
+
+    Args:
+        exception: Exception to log and raise.
+        traceback: Exception traceback to log.
+        automl: AutoMLSearch object.
+        **kwargs: Other relevant keyword arguments to log.
+
+    Raises:
+        exception: Raises the input exception.
     """
     logger.error(f"AutoML search raised a fatal exception: {str(exception)}")
     logger.error("\n".join(traceback))
@@ -23,6 +32,12 @@ def log_error_callback(exception, traceback, automl, **kwargs):
     """Logs the exception thrown as an error.
 
     Will not throw. This is the default behavior for AutoMLSearch.
+
+    Args:
+        exception: Exception to log.
+        traceback: Exception traceback to log.
+        automl: AutoMLSearch object.
+        **kwargs: Other relevant keyword arguments to log.
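As documented above, error callbacks receive the raised exception, the formatted traceback lines, and the AutoMLSearch instance. A sketch of a custom callback with the same shape that only warns; the wiring into a search run is assumed, but the `"\n".join(traceback)` pattern mirrors the callbacks shown here:

    import logging

    logger = logging.getLogger(__name__)

    def warn_error_callback(exception, traceback, automl, **kwargs):
        """Log the exception as a warning instead of raising it or logging an error."""
        logger.warning(f"AutoML search raised an exception: {str(exception)}")
        logger.warning("\n".join(traceback))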
""" fold_num = kwargs.get("fold_num") pipeline = kwargs.get("pipeline") diff --git a/evalml/automl/pipeline_search_plots.py b/evalml/automl/pipeline_search_plots.py index c8cca357b6..c2fafbeb59 100644 --- a/evalml/automl/pipeline_search_plots.py +++ b/evalml/automl/pipeline_search_plots.py @@ -101,6 +101,9 @@ def __init__(self, results, objective): def search_iteration_plot(self, interactive_plot=False): """Shows a plot of the best score at each iteration using data gathered during training. + Args: + interactive_plot (bool): Whether or not to show an interactive plot. Defaults to False. + Returns: plot """ diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 9c143c7f7a..0aad2cb020 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -74,6 +74,9 @@ def make_data_splitter( Returns: sklearn.model_selection.BaseCrossValidator: Data splitting method. + + Raises: + ValueError: If problem_configuration is not given for a time-series problem. """ random_seed = random_seed problem_type = handle_problem_types(problem_type) @@ -130,13 +133,10 @@ def check_all_pipeline_names_unique(pipelines): """Checks whether all the pipeline names are unique. Args: - pipelines (list(PipelineBase)): List of pipelines to check if all names are unique. - - Returns: - None + pipelines (list[PipelineBase]): List of pipelines to check if all names are unique. Raises: - ValueError: if any pipeline names are duplicated. + ValueError: If any pipeline names are duplicated. """ name_count = pd.Series([p.name for p in pipelines]).value_counts() duplicate_names = name_count[name_count > 1].index.tolist() diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py index 9f8295f3a6..80f49c140f 100644 --- a/evalml/model_understanding/permutation_importance.py +++ b/evalml/model_understanding/permutation_importance.py @@ -94,6 +94,10 @@ def calculate_permutation_importance_one_column( Returns: float: Mean feature importance scores over a number of shuffles. + + Raises: + ValueError: If pipeline does not support fast permutation importance calculation. + ValueError: If precomputed_features is None. """ X = infer_feature_types(X) y = infer_feature_types(y) diff --git a/evalml/tuners/skopt_tuner.py b/evalml/tuners/skopt_tuner.py index 3310774963..e3b06ff631 100644 --- a/evalml/tuners/skopt_tuner.py +++ b/evalml/tuners/skopt_tuner.py @@ -38,6 +38,10 @@ def add(self, pipeline_parameters, score): Returns: None + + Raises: + Exception: If skopt tuner errors. + ParameterError: If skopt receives invalid parameters. """ # skip adding nan scores if pd.isnull(score): diff --git a/setup.cfg b/setup.cfg index c9431f1a59..6d47d023ae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,3 +23,4 @@ skip=__init__.py [darglint] ignore=DAR402 ignore_regex=^_(.*) +strictness=short \ No newline at end of file From f8255f73958935e05636c82cfa9b135c3a560419 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 9 Sep 2021 01:12:52 -0400 Subject: [PATCH 30/62] some more cleanup --- evalml/automl/automl_search.py | 7 +++---- .../estimators/regressors/baseline_regressor.py | 3 +++ .../regressors/time_series_baseline_estimator.py | 10 ++++++++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 886452f8d6..d6702c967a 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -129,9 +129,9 @@ def search( problem_type (str or ProblemTypes): Type of supervised learning problem. 
See evalml.problem_types.ProblemType.all_problem_types for a full list. objective (str, ObjectiveBase): The objective to optimize for. Used to propose and rank pipelines, but not for optimizing each pipeline during fit-time. When set to 'auto', chooses: - - LogLossBinary for binary classification problems, - - LogLossMulticlass for multiclass classification problems, and - - R2 for regression problems. + - LogLossBinary for binary classification problems, + - LogLossMulticlass for multiclass classification problems, and + - R2 for regression problems. mode (str): mode for DefaultAlgorithm. There are two modes: fast and long, where fast is a subset of long. Please look at DefaultAlgorithm for more details. max_time (int, str): Maximum time to search for pipelines. This will not start a new pipeline search after the duration @@ -293,7 +293,6 @@ class AutoMLSearch: objective (str, ObjectiveBase): The objective to optimize for. Used to propose and rank pipelines, but not for optimizing each pipeline during fit-time. When set to 'auto', chooses: - - LogLossBinary for binary classification problems, - LogLossMulticlass for multiclass classification problems, and - R2 for regression problems. diff --git a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py index 59e3dc8ee4..e930541c05 100644 --- a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py @@ -53,6 +53,9 @@ def fit(self, X, y=None): Returns: self + + Raises: + ValueError: If input y is None. """ if y is None: raise ValueError("Cannot fit Baseline regressor if y is None") diff --git a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py index 49a05afc79..79f6273d29 100644 --- a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py +++ b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py @@ -72,10 +72,13 @@ def predict(self, X, y=None): Args: X (pd.DataFrame): Data of shape [n_samples, n_features]. - y (pd.Series): Target data. + y (pd.Series): Target data. Defaults to None. Returns: pd.Series: Predicted values. + + Raises: + ValueError: If input y is None. """ if y is None: raise ValueError( @@ -93,10 +96,13 @@ def predict_proba(self, X, y=None): Args: X (pd.DataFrame): Data of shape [n_samples, n_features]. - y (pd.Series): Target data. + y (pd.Series): Target data. Defaults to None. Returns: pd.DataFrame: Predicted probability values. + + Raises: + ValueError: If input y is None. 
""" if y is None: raise ValueError( From f3491f983dad68322adf6c868aaedf90b51caea1 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 9 Sep 2021 13:58:59 -0400 Subject: [PATCH 31/62] more cleanup --- evalml/model_family/utils.py | 5 ++++- evalml/model_understanding/graphs.py | 21 ++++++++++++++----- evalml/pipelines/component_graph.py | 15 +++++++++++++ evalml/pipelines/components/component_base.py | 7 ++----- .../components/estimators/estimator.py | 3 +++ .../estimators/regressors/arima_regressor.py | 6 ++++++ .../dimensionality_reduction/lda.py | 9 ++++++++ .../dimensionality_reduction/pca.py | 9 ++++++++ .../transformers/imputers/target_imputer.py | 10 +++++++-- .../preprocessing/drop_rows_transformer.py | 3 +++ .../preprocessing/polynomial_detrender.py | 3 +++ evalml/pipelines/components/utils.py | 18 ++++++++++++---- evalml/pipelines/pipeline_base.py | 13 +++++++++--- evalml/pipelines/pipeline_meta.py | 7 +++++-- evalml/pipelines/utils.py | 5 ++++- evalml/utils/gen_utils.py | 6 ++---- 16 files changed, 113 insertions(+), 27 deletions(-) diff --git a/evalml/model_family/utils.py b/evalml/model_family/utils.py index 0a289a6dbe..3b95b9491d 100644 --- a/evalml/model_family/utils.py +++ b/evalml/model_family/utils.py @@ -6,10 +6,13 @@ def handle_model_family(model_family): """Handles model_family by either returning the ModelFamily or converting from a string. Args: - model_family (str or ModelFamily): Model type that needs to be handled + model_family (str or ModelFamily): Model type that needs to be handled. Returns: ModelFamily + + Raises: + KeyError: If input is not a valid model family. """ if isinstance(model_family, str): try: diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 5d946f2ab2..bb5d34ca5f 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1459,6 +1459,9 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): Returns: plotly.Figure: Showing the prediction vs actual over time. + + Raises: + ValueError: If the pipeline is not a time-series regression pipeline. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -1507,6 +1510,10 @@ def get_linear_coefficients(estimator, features=None): Returns: pd.DataFrame: Displaying the features by importance. + + Raises: + ValueError: If the model is not a linear model. + NotFittedError: If the model is not yet fitted. """ if not estimator.model_family == ModelFamily.LINEAR_MODEL: raise ValueError( @@ -1541,9 +1548,9 @@ def t_sne( X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning - algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. + algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad - local minimum, increasing the learning rate may help. + local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. Returns: @@ -1583,16 +1590,20 @@ def graph_t_sne( Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. 
         n_components (int, optional): Dimension of the embedded space.
-        perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning
-            algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50.
+        perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning
+            algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50.
         learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad
-        local minimum, increasing the learning rate may help.
+            local minimum, increasing the learning rate may help.
         metric (str, optional): The metric to use when calculating distance between instances in a feature array.
         marker_line_width (int, optional): Determines the line width of the marker boundary.
         marker_size (int, optional): Determines the size of the marker.
+        **kwargs: Additional keyword arguments to pass.
 
     Returns:
         plotly.Figure: Figure representing the transformed data.
+
+    Raises:
+        ValueError: If marker_line_width or marker_size are not valid values.
     """
     _go = import_or_raise(
         "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects"
diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py
index d202202a51..b554a83a37 100644
--- a/evalml/pipelines/component_graph.py
+++ b/evalml/pipelines/component_graph.py
@@ -448,6 +448,9 @@ def get_component(self, component_name):
 
         Returns:
             ComponentBase object
+
+        Raises:
+            ValueError: If the component is not in the graph.
         """
         try:
             return self.component_instances[component_name]
@@ -459,6 +462,9 @@ def get_last_component(self):
 
         Returns:
             ComponentBase object
+
+        Raises:
+            ValueError: If the component graph has no edges.
         """
         if len(self.compute_order) == 0:
             raise ValueError("Cannot get last component from edgeless graph")
@@ -470,6 +476,9 @@ def get_estimators(self):
 
         Returns:
             list: All estimator objects within the graph.
+
+        Raises:
+            ValueError: If the component graph is not yet instantiated.
         """
         if not isinstance(self.get_last_component(), ComponentBase):
             raise ValueError(
@@ -489,6 +498,9 @@ def get_inputs(self, component_name):
 
         Returns:
             list[str]: List of inputs for the component to use.
+
+        Raises:
+            KeyError: If the component is not in the graph.
         """
         try:
             component_info = self.component_dict[component_name]
@@ -530,6 +542,9 @@ def graph(self, name=None, graph_format=None):
 
         Returns:
             graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks.
+
+        Raises:
+            RuntimeError: If graphviz is not installed.
         """
         graphviz = import_or_raise(
             "graphviz", error_msg="Please install graphviz to visualize pipelines."
diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py
index decf70fe36..60342768ab 100644
--- a/evalml/pipelines/components/component_base.py
+++ b/evalml/pipelines/components/component_base.py
@@ -163,11 +163,8 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL):
         """Saves component at file path.
 
         Args:
-            file_path (str): Location to save file
+            file_path (str): Location to save file.
             pickle_protocol (int): The pickle data stream format.
-
-        Returns:
-            None
         """
         with open(file_path, "wb") as f:
             cloudpickle.dump(self, f, protocol=pickle_protocol)
@@ -177,7 +174,7 @@ def load(file_path):
         """Loads component at file path.
         Args:
-            file_path (str): Location to load file
+            file_path (str): Location to load file.
 
         Returns:
             ComponentBase object
diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py
index 11ae564166..5425347f3c 100644
--- a/evalml/pipelines/components/estimators/estimator.py
+++ b/evalml/pipelines/components/estimators/estimator.py
@@ -80,6 +80,9 @@ def predict(self, X):
 
         Returns:
             pd.Series: Predicted values.
+
+        Raises:
+            AttributeError: If estimator does not have a predict method or a component_obj that implements predict.
         """
         try:
             X = infer_feature_types(X)
diff --git a/evalml/pipelines/components/estimators/regressors/arima_regressor.py b/evalml/pipelines/components/estimators/regressors/arima_regressor.py
index 9f4956ba39..8287c19647 100644
--- a/evalml/pipelines/components/estimators/regressors/arima_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/arima_regressor.py
@@ -166,6 +166,9 @@ def fit(self, X, y=None):
 
         Returns:
             self
+
+        Raises:
+            ValueError: If y was not passed in.
         """
         if y is None:
             raise ValueError("ARIMA Regressor requires y as input.")
@@ -189,6 +192,9 @@ def predict(self, X, y=None):
 
         Returns:
             pd.Series: Predicted values.
+
+        Raises:
+            ValueError: If X was passed to `fit` but not passed in `predict`.
         """
         X, y = self._manage_woodwork(X, y)
         dates, X = self._get_dates(X, y)
diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
index 97d00bbdb3..effe66d053 100644
--- a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
+++ b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py
@@ -43,6 +43,9 @@ def fit(self, X, y):
 
         Returns:
             self
+
+        Raises:
+            ValueError: If input data is not all numeric.
         """
         X = infer_feature_types(X)
         if not is_all_numeric(X):
@@ -66,6 +69,9 @@ def transform(self, X, y=None):
 
         Returns:
             pd.DataFrame: Transformed data.
+
+        Raises:
+            ValueError: If input data is not all numeric.
         """
         X_ww = infer_feature_types(X)
         if not is_all_numeric(X_ww):
@@ -87,6 +93,9 @@ def fit_transform(self, X, y=None):
 
         Returns:
             pd.DataFrame: Transformed data.
+
+        Raises:
+            ValueError: If input data is not all numeric.
         """
         X_ww = infer_feature_types(X)
         if not is_all_numeric(X_ww):
diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py
index 049e14bd9e..4521bff1cd 100644
--- a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py
+++ b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py
@@ -46,6 +46,9 @@ def fit(self, X, y=None):
 
         Returns:
             self
+
+        Raises:
+            ValueError: If input data is not all numeric.
         """
         X = infer_feature_types(X)
         if not is_all_numeric(X):
@@ -62,6 +65,9 @@ def transform(self, X, y=None):
 
         Returns:
             pd.DataFrame: Transformed data.
+
+        Raises:
+            ValueError: If input data is not all numeric.
         """
         X_ww = infer_feature_types(X)
         if not is_all_numeric(X_ww):
@@ -83,6 +89,9 @@ def fit_transform(self, X, y=None):
 
         Returns:
             pd.DataFrame: Transformed data.
+
+        Raises:
+            ValueError: If input data is not all numeric.
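The Raises entries above document the same guard in both LDA and PCA: the transformers refuse non-numeric input. A small sketch of the expected failure mode, assuming the component can be imported from evalml.pipelines.components:

    import pandas as pd
    from evalml.pipelines.components import PCA

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": ["x", "y", "z"]})
    try:
        PCA().fit(df)
    except ValueError as err:
        print(err)  # the component rejects data that is not all numeric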
""" X_ww = infer_feature_types(X) if not is_all_numeric(X_ww): diff --git a/evalml/pipelines/components/transformers/imputers/target_imputer.py b/evalml/pipelines/components/transformers/imputers/target_imputer.py index 2875086d2c..8e805061bf 100644 --- a/evalml/pipelines/components/transformers/imputers/target_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/target_imputer.py @@ -20,8 +20,11 @@ class TargetImputerMeta(ComponentBaseMeta): def check_for_fit(cls, method): """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. - It raises an exception if `False` and calls and returns the - wrapped method if `True`. + Args: + method (callable): Method to wrap. + + Raises: + ComponentNotYetFittedError: If component is not fitted. """ @wraps(method) @@ -75,6 +78,9 @@ def fit(self, X, y): Returns: self + + Raises: + TypeError: If target is filled with all null values. """ if y is None: return self diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py b/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py index b7cf01f0f0..8b6301af74 100644 --- a/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/drop_rows_transformer.py @@ -33,6 +33,9 @@ def fit(self, X, y=None): Returns: self + + Raises: + ValueError: If indices to drop do not exist in input features or target. """ X_t = infer_feature_types(X) y_t = infer_feature_types(y) if y is not None else None diff --git a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py index 63906153bb..7a65023791 100644 --- a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py +++ b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py @@ -56,6 +56,9 @@ def fit(self, X, y=None): Returns: self + + Raises: + ValueError: If y is None. """ if y is None: raise ValueError("y cannot be None for PolynomialDetrender!") diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index 439bd6b882..0158d58860 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -37,7 +37,7 @@ def allowed_model_families(problem_type): """List the model types allowed for a particular problem type. Args: - problem_types (ProblemTypes or str): ProblemTypes enum or string. + problem_type (ProblemTypes or str): ProblemTypes enum or string. Returns: list[ModelFamily]: A list of model families. @@ -65,6 +65,10 @@ def get_estimators(problem_type, model_families=None): Returns: list[class]: A list of estimator subclasses. + + Raises: + TypeError: If the model_families parameter is not a list. + RuntimeError: If a model family is not valid for the problem type. """ if model_families is not None and not isinstance(model_families, list): raise TypeError("model_families parameter is not a list.") @@ -104,7 +108,7 @@ def handle_component_class(component_class): will return that without modification. Args: - component (str, ComponentBase): Input to be standardized. + component_class (str, ComponentBase): Input to be standardized. Returns: ComponentBase @@ -265,6 +269,9 @@ def generate_component_code(element): Returns: String representation of Python code that can be run separately in order to recreate the component instance. Does not include code for custom component implementation. 
+ + Raises: + ValueError: If the input element is not a component instance. """ # hold the imports needed and add code to end code_strings = [] @@ -295,12 +302,15 @@ def make_balancing_dictionary(y, sampling_ratio): """Makes dictionary for oversampler components. Find ratio of each class to the majority. If the ratio is smaller than the sampling_ratio, we want to oversample, otherwise, we don't want to sample at all, and we leave the data as is. Args: - y (pd.Series): Target data - sampling_ratio (float): The balanced ratio we want the samples to meet + y (pd.Series): Target data. + sampling_ratio (float): The balanced ratio we want the samples to meet. Returns: dict: Dictionary where keys are the classes, and the corresponding values are the counts of samples for each class that will satisfy sampling_ratio. + + Raises: + ValueError: If sampling ratio is not in the range (0, 1] or the target is empty. """ if sampling_ratio <= 0 or sampling_ratio > 1: raise ValueError( diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 6a4e25b2ab..198cdf1579 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -132,6 +132,9 @@ def summary(self): """A short summary of the pipeline structure, describing the list of components used. Example: Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder + + Returns: + A string describing the pipeline structure. """ component_graph = [ type(self.component_graph.component_instances[component]) @@ -254,6 +257,7 @@ def compute_estimator_features(self, X, y=None): Args: X (pd.DataFrame): Input data to the pipeline to transform. + y (pd.Series): Target data. Returns: pd.DataFrame: New transformed features. @@ -421,6 +425,9 @@ def graph(self, filepath=None): Returns: graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks. + + Raises: + RuntimeError: If graphviz is not installed. """ graphviz = import_or_raise( "graphviz", error_msg="Please install graphviz to visualize pipelines." @@ -521,9 +528,6 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): Args: file_path (str): Location to save file. pickle_protocol (int): The pickle data stream format. - - Returns: - None """ with open(file_path, "wb") as f: cloudpickle.dump(self, f, protocol=pickle_protocol) @@ -708,6 +712,9 @@ def inverse_transform(self, y): Args: y (pd.Series): Final component features. + + Returns: + pd.Series: The inverse transform of the target. """ return self.component_graph.inverse_transform(y) diff --git a/evalml/pipelines/pipeline_meta.py b/evalml/pipelines/pipeline_meta.py index e01f740337..2a6913d0a5 100644 --- a/evalml/pipelines/pipeline_meta.py +++ b/evalml/pipelines/pipeline_meta.py @@ -12,8 +12,11 @@ class PipelineBaseMeta(BaseMeta): def check_for_fit(cls, method): """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. - It raises an exception if `False` and calls and returns the - wrapped method if `True`. + Args: + method (callable): Method to wrap. + + Raises: + PipelineNotYetFittedError: If pipeline is not yet fitted. """ @wraps(method) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index beb682f510..00236a9a68 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -194,10 +194,13 @@ def make_pipeline( An empty dictionary or None implies using all default values for component parameters. sampler_name (str): The name of the sampler component to add to the pipeline. 
Only used in classification problems. Defaults to None - extra_components (list(ComponentBase)): List of extra components to be added after preprocessing components. Defaults to None. + extra_components (list[ComponentBase]): List of extra components to be added after preprocessing components. Defaults to None. Returns: PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator. + + Raises: + ValueError: If estimator is not valid for the given problem type, or sampling is not supported for the given problem type. """ X = infer_feature_types(X) y = infer_feature_types(y) diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index 7c99927771..407f51ccde 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -189,8 +189,6 @@ def get_importable_subclasses(base_class, used_in_automl=True): Args: base_class (abc.ABCMeta): Base class to find all of the subclasses for. - args (list): Args used to instantiate the subclass. [{}] for a pipeline, and [] for - all other classes. used_in_automl: Not all components/pipelines/estimators are used in automl search. If True, only include those subclasses that are used in the search. This would mean excluding classes related to ExtraTrees, ElasticNet, and Baseline estimators. @@ -406,9 +404,9 @@ def save_plot( fig (Figure): Figure to be saved. filepath (str or Path, optional): Location to save file. Default is with filename "test_plot". format (str): Extension for figure to be saved as. Ignored if interactive is True and fig - is of type plotly.Figure. Defaults to 'png'. + is of type plotly.Figure. Defaults to 'png'. interactive (bool, optional): If True and fig is of type plotly.Figure, saves the fig as interactive - instead of static, and format will be set to 'html'. Defaults to False. + instead of static, and format will be set to 'html'. Defaults to False. return_filepath (bool, optional): Whether to return the final filepath the image is saved to. Defaults to False. 
Returns: From 5b15cd674ee61e693ea8c2fcb89e9f5d19634a58 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 9 Sep 2021 16:19:19 -0400 Subject: [PATCH 32/62] more cleanup --- evalml/automl/automl_search.py | 6 +-- evalml/model_family/utils.py | 1 + evalml/model_understanding/force_plots.py | 2 +- evalml/model_understanding/graphs.py | 40 ++++++++++++------- .../binary_classification_objective.py | 8 +++- evalml/objectives/fraud_cost.py | 11 +++-- evalml/objectives/objective_base.py | 13 +++--- evalml/objectives/sensitivity_low_alert.py | 13 ++++-- evalml/objectives/utils.py | 6 +++ .../binary_classification_pipeline_mixin.py | 14 +++---- evalml/pipelines/classification_pipeline.py | 3 ++ evalml/pipelines/component_graph.py | 24 +++++++++-- evalml/pipelines/components/component_base.py | 9 ++++- .../components/component_base_meta.py | 12 +++++- .../classifiers/baseline_classifier.py | 3 ++ .../estimators/classifiers/svm_classifier.py | 3 ++ .../components/estimators/estimator.py | 12 ++++-- .../estimators/regressors/svm_regressor.py | 3 ++ .../transformers/encoders/onehot_encoder.py | 6 +++ .../feature_selection/feature_selector.py | 3 ++ .../transformers/imputers/target_imputer.py | 3 ++ .../preprocessing/datetime_featurizer.py | 4 +- .../transformers/preprocessing/lsa.py | 2 + .../preprocessing/polynomial_detrender.py | 3 ++ .../transformers/samplers/base_sampler.py | 3 ++ .../components/transformers/transformer.py | 12 ++++-- evalml/pipelines/components/utils.py | 6 ++- evalml/pipelines/pipeline_base.py | 7 +++- evalml/pipelines/pipeline_meta.py | 3 ++ evalml/pipelines/regression_pipeline.py | 3 ++ .../time_series_classification_pipelines.py | 7 ++++ .../time_series_regression_pipeline.py | 3 ++ evalml/pipelines/utils.py | 5 ++- evalml/problem_types/utils.py | 13 ++++-- evalml/utils/gen_utils.py | 29 +++++++++++--- evalml/utils/logger.py | 3 ++ evalml/utils/woodwork_utils.py | 3 ++ 37 files changed, 230 insertions(+), 71 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 115d0b7b72..ef78a73155 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -128,9 +128,9 @@ def search( problem_type (str or ProblemTypes): Type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. objective (str, ObjectiveBase): The objective to optimize for. Used to propose and rank pipelines, but not for optimizing each pipeline during fit-time. When set to 'auto', chooses: - - LogLossBinary for binary classification problems, - - LogLossMulticlass for multiclass classification problems, and - - R2 for regression problems. + - LogLossBinary for binary classification problems, + - LogLossMulticlass for multiclass classification problems, and + - R2 for regression problems. mode (str): mode for DefaultAlgorithm. There are two modes: fast and long, where fast is a subset of long. Please look at DefaultAlgorithm for more details. max_time (int, str): Maximum time to search for pipelines. This will not start a new pipeline search after the duration diff --git a/evalml/model_family/utils.py b/evalml/model_family/utils.py index 3b95b9491d..f23be16388 100644 --- a/evalml/model_family/utils.py +++ b/evalml/model_family/utils.py @@ -13,6 +13,7 @@ def handle_model_family(model_family): Raises: KeyError: If input is not a valid model family. + ValueError: If input is not a string or ModelFamily object. 
""" if isinstance(model_family, str): try: diff --git a/evalml/model_understanding/force_plots.py b/evalml/model_understanding/force_plots.py index c129dd07c1..809b7e65b9 100644 --- a/evalml/model_understanding/force_plots.py +++ b/evalml/model_understanding/force_plots.py @@ -14,7 +14,7 @@ def graph_force_plot(pipeline, rows_to_explain, training_data, y, matplotlib=Fal Args: pipeline (PipelineBase): The pipeline to generate the force plot for. - rows_to_explain (list(int)): A list of the indices indicating which of the rows of + rows_to_explain (list[int]): A list of the indices indicating which of the rows of the training_data to explain. training_data (pandas.DataFrame): The data used to train the pipeline. y (pandas.Series): The target data for the pipeline. diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index bb5d34ca5f..43b61453a4 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -46,7 +46,7 @@ def confusion_matrix(y_true, y_predicted, normalize_method="true"): Args: y_true (pd.Series or np.ndarray): True binary labels. - y_pred (pd.Series or np.ndarray): Predictions from a binary classifier. + y_predicted (pd.Series or np.ndarray): Predictions from a binary classifier. normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. Returns: @@ -73,6 +73,9 @@ def normalize_confusion_matrix(conf_mat, normalize_method="true"): Returns: pd.DataFrame: normalized version of the input confusion matrix. The column header represents the predicted labels while row header represents the actual labels. + + Raises: + ValueError: If configuration is invalid, or if the sum of a given axis is zero and normalization by axis is specified. """ conf_mat = infer_feature_types(conf_mat) col_names = conf_mat.columns @@ -196,6 +199,9 @@ def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): * `recall`: Recall values. * `thresholds`: Threshold values used to produce the precision and recall. * `auc_score`: The area under the ROC curve. + + Raises: + NoPositiveLabelException: If predicted probabilities do not contain a column at the specified label. """ y_true = infer_feature_types(y_true) y_pred_proba = infer_feature_types(y_pred_proba) @@ -313,11 +319,14 @@ def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_additio Args: y_true (pd.Series or np.ndarray): True labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. Note this should a one dimensional array with the predicted probability for the "true" label in the binary case. - custom_class_labels (list or None): If not None, custom labels for classes. Default None. - title_addition (str or None): if not None, append to plot title. Default None. + custom_class_names (list): If not None, custom labels for classes. Default None. + title_addition (str: if not None, append to plot title. Default None. Returns: - plotly.Figure representing the ROC plot generated + plotly.Figure representing the ROC plot generated. + + Raises: + ValueError: If the number of custom class names does not match number of classes in the input data. 
""" _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -583,12 +592,12 @@ def partial_dependence( grid_resolution (int): Number of samples of feature(s) for partial dependence plot. If this value is less than the maximum number of categories present in categorical data within X, it will be set to the max number of categories + 1. Defaults to 100. - kind {'average', 'individual', 'both'}: The type of predictions to return. 'individual' will return the predictions for + kind ({'average', 'individual', 'both'}): The type of predictions to return. 'individual' will return the predictions for all of the points in the grid for each sample in X. 'average' will return the predictions for all of the points in the grid but averaged over all of the samples in X. Returns: - pd.DataFrame, list(pd.DataFrame), or tuple(pd.DataFrame, list(pd.DataFrame)): + pd.DataFrame, list[pd.DataFrame], or tuple(pd.DataFrame, list[pd.DataFrame]): When `kind='average'`: DataFrame with averaged predictions for all points in the grid averaged over all samples of X and the values used to calculate those predictions. @@ -944,8 +953,8 @@ def graph_partial_dependence( the partial dependence for each class. This argument does not change behavior for regression or binary classification pipelines. For binary classification, the partial dependence for the positive label will always be displayed. Defaults to None. - grid_resolution (int): Number of samples of feature(s) for partial dependence plot - kind {'average', 'individual', 'both'}: Type of partial dependence to plot. 'average' creates a regular partial dependence + grid_resolution (int): Number of samples of feature(s) for partial dependence plot. + kind ({'average', 'individual', 'both'}): Type of partial dependence to plot. 'average' creates a regular partial dependence (PD) graph, 'individual' creates an individual conditional expectation (ICE) plot, and 'both' creates a single-figure PD and ICE plot. ICE plots can only be shown for one-way partial dependence plots. @@ -1348,13 +1357,10 @@ def visualize_decision_tree( Args: estimator (ComponentBase): A fitted DecisionTree-based estimator. - max_depth (int, optional): The depth to which the tree should be displayed. If set to None (as by default), - tree is fully generated. + max_depth (int, optional): The depth to which the tree should be displayed. If set to None (as by default), tree is fully generated. rotate (bool, optional): Orient tree left to right rather than top-down. - filled (bool, optional): Paint nodes to indicate majority class for classification, extremity of values for - regression, or purity of node for multi-output. - filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph - will not be saved. + filled (bool, optional): Paint nodes to indicate majority class for classification, extremity of values for regression, or purity of node for multi-output. + filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph will not be saved. Returns: graphviz.Source: DOT object that can be directly displayed in Jupyter notebooks. @@ -1433,7 +1439,7 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): dates (pd.Series): Dates corresponding to target values and predictions. Returns: - pd.DataFrame + pd.DataFrame: Predictions vs time. 
""" dates = infer_feature_types(dates) y = infer_feature_types(y) @@ -1552,9 +1558,13 @@ def t_sne( learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. + **kwargs: Additional arbitrary arguments. Returns: np.ndarray (n_samples, n_components) + + Raises: + ValueError: If specified parameters are not valid values. """ if not isinstance(n_components, int) or not n_components > 0: raise ValueError( diff --git a/evalml/objectives/binary_classification_objective.py b/evalml/objectives/binary_classification_objective.py index 7e0ce67be5..1a1ead961f 100644 --- a/evalml/objectives/binary_classification_objective.py +++ b/evalml/objectives/binary_classification_objective.py @@ -20,6 +20,9 @@ def can_optimize_threshold(cls): This will be false for any objective that works directly with predicted probabilities, like log loss and AUC. Otherwise, it will be true. + + Returns: + bool: Whether or not an objective can be optimized. """ return not cls.score_needs_proba @@ -32,7 +35,10 @@ def optimize_threshold(self, ypred_proba, y_true, X=None): X (pd.DataFrame, optional): Any extra columns that are needed from training data. Returns: - Optimal threshold for this objective + Optimal threshold for this objective. + + Raises: + RuntimeError: If objective cannot be optimized. """ ypred_proba = self._standardize_input_type(ypred_proba) y_true = self._standardize_input_type(y_true) diff --git a/evalml/objectives/fraud_cost.py b/evalml/objectives/fraud_cost.py index 29db2d76ab..30fa3d8e65 100644 --- a/evalml/objectives/fraud_cost.py +++ b/evalml/objectives/fraud_cost.py @@ -38,13 +38,16 @@ def objective_function(self, y_true, y_predicted, X, sample_weight=None): """Calculate amount lost to fraud per transaction given predictions, true values, and dataframe with transaction amount. Args: - y_predicted (pd.Series): Predicted fraud labels - y_true (pd.Series): True fraud labels - X (pd.DataFrame): Data with transaction amounts + y_predicted (pd.Series): Predicted fraud labels. + y_true (pd.Series): True fraud labels. + X (pd.DataFrame): Data with transaction amounts. sample_weight (pd.DataFrame): Ignored. Returns: - float: Amount lost to fraud per transaction + float: Amount lost to fraud per transaction. + + Raises: + ValueError: If amount_col is not a valid column in the input data. """ X = self._standardize_input_type(X) y_true = self._standardize_input_type(y_true) diff --git a/evalml/objectives/objective_base.py b/evalml/objectives/objective_base.py index 0db65afca5..dd6fa497eb 100644 --- a/evalml/objectives/objective_base.py +++ b/evalml/objectives/objective_base.py @@ -76,10 +76,7 @@ def objective_function(cls, y_true, y_predicted, X=None, sample_weight=None): @classproperty def positive_only(cls): - """If True, this objective is only valid for positive data. - - Defaults to False. - """ + """If True, this objective is only valid for positive data. Defaults to False.""" return False def score(self, y_true, y_predicted, X=None, sample_weight=None): @@ -128,11 +125,11 @@ def validate_inputs(self, y_true, y_predicted): """Validates the input based on a few simple checks. 
         Args:
             y_predicted (pd.Series, or pd.DataFrame): Predicted values of length [n_samples].
             y_true (pd.Series): Actual class labels of length [n_samples].
 
-        Returns:
-            None
+        Raises:
+            ValueError: If the inputs are malformed.
         """
         if y_predicted.shape[0] != y_true.shape[0]:
             raise ValueError(
diff --git a/evalml/objectives/sensitivity_low_alert.py b/evalml/objectives/sensitivity_low_alert.py
index 4c5d16ff22..658dbbf31c 100644
--- a/evalml/objectives/sensitivity_low_alert.py
+++ b/evalml/objectives/sensitivity_low_alert.py
@@ -32,7 +32,11 @@ def decision_function(self, ypred_proba, **kwargs):
         """Determine if an observation is high risk given an alert rate.
 
         Args:
-            ypred_proba (pd.Series): Predicted probabilities
+            ypred_proba (pd.Series): Predicted probabilities.
+            **kwargs: Additional arbitrary parameters.
+
+        Returns:
+            pd.Series: Whether or not an observation is high risk given an alert rate.
         """
         ypred_proba = self._standardize_input_type(ypred_proba)
         if len(ypred_proba.unique()) == 1:
@@ -50,11 +54,12 @@ def objective_function(self, y_true, y_predicted, **kwargs):
         """Calculate sensitivity across all predictions, using the top alert_rate percent of observations as the predicted positive class.
 
         Args:
-            y_true (pd.Series): True labels
-            y_predicted (pd.Series): Predicted labels based on alert_rate
+            y_true (pd.Series): True labels.
+            y_predicted (pd.Series): Predicted labels based on alert_rate.
+            **kwargs: Additional arbitrary parameters.
 
         Returns:
-            float: sensitivity using the observations with the top scores as the predicted positive class
+            float: Sensitivity using the observations with the top scores as the predicted positive class.
         """
         y_true = self._standardize_input_type(y_true)
         y_predicted = self._standardize_input_type(y_predicted)
diff --git a/evalml/objectives/utils.py b/evalml/objectives/utils.py
index 14cc0f6967..9cd3e7ef08 100644
--- a/evalml/objectives/utils.py
+++ b/evalml/objectives/utils.py
@@ -79,6 +79,12 @@ def get_objective(objective, return_instance=False, **kwargs):
         ObjectiveBase if the parameter objective is of type ObjectiveBase. If objective is instead a valid
         objective name, function will return the class corresponding to that name. If return_instance is True,
         an instance of that objective will be returned.
+
+    Raises:
+        TypeError: If objective is None.
+        TypeError: If objective is not a string and not an instance of ObjectiveBase.
+        ObjectiveNotFoundError: If input objective is not a valid objective.
+        ObjectiveCreationError: If objective cannot be created properly.
     """
     if objective is None:
         raise TypeError("Objective parameter cannot be NoneType")
diff --git a/evalml/pipelines/binary_classification_pipeline_mixin.py b/evalml/pipelines/binary_classification_pipeline_mixin.py
index 494d1c0883..9090e48e57 100644
--- a/evalml/pipelines/binary_classification_pipeline_mixin.py
+++ b/evalml/pipelines/binary_classification_pipeline_mixin.py
@@ -8,10 +8,7 @@ class BinaryClassificationPipelineMixin:
 
     @property
     def threshold(self):
-        """Threshold used to make a prediction.
-
-        Defaults to None.
-        """
+        """Threshold used to make a prediction. Defaults to None."""
         return self._threshold
 
     @threshold.setter
@@ -48,10 +45,13 @@ def optimize_threshold(self, X, y, y_pred_proba, objective):
         """Optimize the pipeline threshold given the objective to use. Only used for binary problems with objectives whose thresholds can be tuned.
Args: - X (pd.DataFrame): Input features - y (pd.Series): Input target values - y_pred_proba (pd.Series): The predicted probabilities of the target outputted by the pipeline + X (pd.DataFrame): Input features. + y (pd.Series): Input target values. + y_pred_proba (pd.Series): The predicted probabilities of the target outputted by the pipeline. objective (ObjectiveBase): The objective to threshold with. Must have a tunable threshold. + + Raises: + ValueError: If objective is not optimizable. """ if self.can_tune_threshold_with_objective(objective): targets = self._encode_targets(y) diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py index 1716bf6416..3b43085af5 100644 --- a/evalml/pipelines/classification_pipeline.py +++ b/evalml/pipelines/classification_pipeline.py @@ -114,6 +114,9 @@ def predict_proba(self, X): Returns: pd.DataFrame: Probability estimates + + Raises: + ValueError: If final component is not an estimator. """ if self.estimator is None: raise ValueError( diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 991c1876f3..fecdb6fe7c 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -140,8 +140,14 @@ def instantiate(self, parameters): Args: parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. - An empty dictionary {} or None implies using all default values for component parameters. If a component - in the component graph is already instantiated, it will not use any of its parameters defined in this dictionary. + An empty dictionary {} or None implies using all default values for component parameters. If a component + in the component graph is already instantiated, it will not use any of its parameters defined in this dictionary. + + Returns: + self + + Raises: + ValueError: If component graph is already instantiated or if a component errored while instantiating. """ if self._is_instantiated: raise ValueError( @@ -179,6 +185,9 @@ def fit(self, X, y): Args: X (pd.DataFrame): The input training data of shape [n_samples, n_features]. y (pd.Series): The target training data of length [n_samples]. + + Returns: + self """ X = infer_feature_types(X) y = infer_feature_types(y) @@ -266,6 +275,9 @@ def transform(self, X, y=None): Returns: pd.DataFrame: Transformed output. + + Raises: + ValueError: If final component is not a Transformer. """ if len(self.compute_order) == 0: return infer_feature_types(X) @@ -291,6 +303,9 @@ def predict(self, X): Returns: pd.Series: Predicted values. + + Raises: + ValueError: If final component is not an Estimator. """ if len(self.compute_order) == 0: return infer_feature_types(X) @@ -499,7 +514,7 @@ def get_inputs(self, component_name): list[str]: List of inputs for the component to use. Raises: - KeyError: If the component is not in the graph. + ValueError: If the component is not in the graph. """ try: component_info = self.component_dict[component_name] @@ -736,6 +751,9 @@ def inverse_transform(self, y): Args: y: (pd.Series): Final component features. + + Returns: + pd.Series: The target with inverse transformation applied. 
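Putting the documented pieces together, a sketch of threshold tuning on a fitted binary pipeline; `pipeline`, `X_holdout`, and `y_holdout` are assumed to exist, and get_objective comes from the utility documented earlier:

    from evalml.objectives import get_objective

    objective = get_objective("f1", return_instance=True)
    # positive-class probabilities from the fitted pipeline (second column assumed)
    y_pred_proba = pipeline.predict_proba(X_holdout).iloc[:, 1]
    pipeline.optimize_threshold(X_holdout, y_holdout, y_pred_proba, objective)
    print(pipeline.threshold)  # threshold chosen to maximize F1 on the holdout data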
""" data_to_transform = infer_feature_types(y) current_component = self.compute_order[-1] diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 35819265da..91ddd6bf40 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -77,8 +77,10 @@ def modifies_target(cls): def needs_fitting(self): """Returns boolean determining if component needs fitting before calling predict, predict_proba, transform, or feature_importances. - This can be overridden to False for components that do not need - to be fit or whose fit methods do nothing. + This can be overridden to False for components that do not need to be fit or whose fit methods do nothing. + + Returns: + True. """ return True @@ -122,6 +124,9 @@ def fit(self, X, y=None): Returns: self + + Raises: + MethodPropertyNotFoundError: If component does not have a fit method or a component_obj that implements fit. """ X = infer_feature_types(X) if y is not None: diff --git a/evalml/pipelines/components/component_base_meta.py b/evalml/pipelines/components/component_base_meta.py index af7c2bbce4..08ed0e33cb 100644 --- a/evalml/pipelines/components/component_base_meta.py +++ b/evalml/pipelines/components/component_base_meta.py @@ -12,8 +12,16 @@ class ComponentBaseMeta(BaseMeta): def check_for_fit(cls, method): """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. - It raises an exception if `False` and calls and returns the - wrapped method if `True`. + It raises an exception if `False` and calls and returns the wrapped method if `True`. + + Args: + method (callable): Method to wrap. + + Returns: + The wrapped method. + + Raises: + ComponentNotYetFittedError: If component is not yet fitted. """ @wraps(method) diff --git a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py index 5ff42a4a33..e4b3385441 100644 --- a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py @@ -51,6 +51,9 @@ def fit(self, X, y=None): Returns: self + + Raises: + ValueError: If y is None. """ if y is None: raise ValueError("Cannot fit Baseline classifier if y is None") diff --git a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py index efc896dec6..03fb6c8ea2 100644 --- a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py @@ -74,6 +74,9 @@ def feature_importance(self): """Feature importance only works with linear kernels. If the kernel isn't linear, we return a numpy array of zeros. + + Returns: + Feature importance of fitted SVM classifier or a numpy array of zeroes if the kernel is not linear. """ if self._parameters["kernel"] != "linear": return np.zeros(self._component_obj.n_features_in_) diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py index 5425347f3c..c136590c46 100644 --- a/evalml/pipelines/components/estimators/estimator.py +++ b/evalml/pipelines/components/estimators/estimator.py @@ -76,13 +76,13 @@ def predict(self, X): """Make predictions using selected features. Args: - X (pd.DataFrame): Data of shape [n_samples, n_features] + X (pd.DataFrame): Data of shape [n_samples, n_features]. 
Returns:
pd.Series: Predicted values.
Raises:
- AttributeError: If estimator does not have a predict method or a component_obj that implements predict.
+ MethodPropertyNotFoundError: If estimator does not have a predict method or a component_obj that implements predict.
"""
try:
X = infer_feature_types(X)
@@ -99,10 +99,13 @@ def predict_proba(self, X):
"""Make probability estimates for labels.
Args:
- X (pd.DataFrame, or np.ndarray): Features
+ X (pd.DataFrame): Features.
Returns:
pd.Series: Probability estimates.
+
+ Raises:
+ MethodPropertyNotFoundError: If estimator does not have a predict_proba method or a component_obj that implements predict_proba.
"""
try:
X = infer_feature_types(X)
@@ -119,6 +122,9 @@ def feature_importance(self):
Returns:
np.ndarray: Importance associated with each feature.
+
+ Raises:
+ MethodPropertyNotFoundError: If estimator does not have a feature_importance method or a component_obj that implements feature_importance.
"""
try:
return self._component_obj.feature_importances_
diff --git a/evalml/pipelines/components/estimators/regressors/svm_regressor.py b/evalml/pipelines/components/estimators/regressors/svm_regressor.py
index 08a9211e0e..b07212faec 100644
--- a/evalml/pipelines/components/estimators/regressors/svm_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/svm_regressor.py
@@ -58,6 +58,9 @@ def feature_importance(self):
"""Feature importance of fitted SVM regressor.
Only works with linear kernels. If the kernel isn't linear, we return a numpy array of zeros.
+
+ Returns:
+ The feature importance of the fitted SVM regressor, or an array of zeros if the kernel is not linear.
"""
if self._parameters["kernel"] != "linear":
return np.zeros(self._component_obj.n_features_in_)
diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py
index fe59199bae..3b3945ca85 100644
--- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py
+++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py
@@ -101,6 +101,9 @@ def fit(self, X, y=None):
Returns:
self
+
+ Raises:
+ ValueError: If encoding a column failed.
"""
top_n = self.parameters["top_n"]
X = infer_feature_types(X)
@@ -216,6 +219,9 @@ def categories(self, feature_name):
Returns:
np.ndarray: The unique categories, in the same dtype as they were provided during fit.
+
+ Raises:
+ ValueError: If feature was not provided to one-hot encoder as a training feature.
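+
+ Example (an editorial usage sketch, not part of this patch; the column name "color" and its values are assumed for illustration):
+ >>> import pandas as pd
+ >>> from evalml.pipelines.components import OneHotEncoder
+ >>> X = pd.DataFrame({"color": ["red", "blue", "red", "green"]})
+ >>> encoder = OneHotEncoder()
+ >>> _ = encoder.fit(X)  # fit returns self
+ >>> cats = encoder.categories("color")  # unique categories seen for "color" during fit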
""" try: index = self.features_to_encode.index(feature_name) diff --git a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py index d1ebc53e67..b5c5c90ea6 100644 --- a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py @@ -42,6 +42,9 @@ def transform(self, X, y=None): Returns: pd.DataFrame: Transformed X + + Raises: + MethodPropertyNotFoundError: If feature selector does not have a transform method or a component_obj that implements transform """ X_ww = infer_feature_types(X) self.input_feature_names = list(X_ww.columns.values) diff --git a/evalml/pipelines/components/transformers/imputers/target_imputer.py b/evalml/pipelines/components/transformers/imputers/target_imputer.py index 8e805061bf..7dcb207df7 100644 --- a/evalml/pipelines/components/transformers/imputers/target_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/target_imputer.py @@ -25,6 +25,9 @@ def check_for_fit(cls, method): Raises: ComponentNotYetFittedError: If component is not fitted. + + Returns: + The wrapped input method. """ @wraps(method) diff --git a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py index 434f04d4c9..a1589375e2 100644 --- a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py @@ -157,8 +157,8 @@ def get_feature_names(self): """Gets the categories of each datetime feature. Returns: - dict: Dictionary, where each key-value pair is a column name and a dictionary - mapping the unique feature values to their integer encoding. + dict: Dictionary, where each key-value pair is a column name and a dictionary + mapping the unique feature values to their integer encoding. """ return self._categories diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 5464dba86a..af7ec69c10 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -35,6 +35,8 @@ def fit(self, X, y=None): X (pd.DataFrame): The data to transform. y (pd.Series, optional): Ignored. + Returns: + self """ X = infer_feature_types(X) self._text_columns = self._get_text_columns(X) diff --git a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py index 7a65023791..00efb05e0c 100644 --- a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py +++ b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py @@ -106,6 +106,9 @@ def inverse_transform(self, y): Returns: tuple of pd.DataFrame, pd.Series: The first element are the input features returned without modification. The second element is the target variable y with the trend added back. + + Raises: + ValueError: If y is None. 
""" if y is None: raise ValueError("y cannot be None for PolynomialDetrender!") diff --git a/evalml/pipelines/components/transformers/samplers/base_sampler.py b/evalml/pipelines/components/transformers/samplers/base_sampler.py index 7dced470f7..b489d4f23c 100644 --- a/evalml/pipelines/components/transformers/samplers/base_sampler.py +++ b/evalml/pipelines/components/transformers/samplers/base_sampler.py @@ -27,6 +27,9 @@ def fit(self, X, y): Returns: self + + Raises: + ValueError: If y is None. """ if y is None: raise ValueError("y cannot be None") diff --git a/evalml/pipelines/components/transformers/transformer.py b/evalml/pipelines/components/transformers/transformer.py index c66595d480..0d853a2fb5 100644 --- a/evalml/pipelines/components/transformers/transformer.py +++ b/evalml/pipelines/components/transformers/transformer.py @@ -43,6 +43,9 @@ def transform(self, X, y=None): Returns: pd.DataFrame: Transformed X + + Raises: + MethodPropertyNotFoundError: If transformer does not have a transform method or a component_obj that implements transform. """ X_ww = infer_feature_types(X) if y is not None: @@ -62,11 +65,14 @@ def fit_transform(self, X, y=None): """Fits on X and transforms X. Args: - X (pd.DataFrame): Data to fit and transform - y (pd.Series): Target data + X (pd.DataFrame): Data to fit and transform. + y (pd.Series): Target data. Returns: - pd.DataFrame: Transformed X + pd.DataFrame: Transformed X. + + Raises: + MethodPropertyNotFoundError: If transformer does not have a transform method or a component_obj that implements transform. """ X_ww = infer_feature_types(X) if y is not None: diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index 2e77ab27c2..d3b1ba00a0 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -110,6 +110,10 @@ def handle_component_class(component_class): Returns: ComponentBase + + Raises: + ValueError: If input is not a valid component class. + MissingComponentError: If the component cannot be found. """ if isinstance(component_class, ComponentBase) or ( inspect.isclass(component_class) and issubclass(component_class, ComponentBase) @@ -118,7 +122,7 @@ def handle_component_class(component_class): if not isinstance(component_class, str): raise ValueError( ( - "component_graph may only contain str or ComponentBase subclasses, not '{}'" + "component_class may only contain str or ComponentBase subclasses, not '{}'" ).format(type(component_class)) ) component_classes = {component.name: component for component in all_components()} diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 1a58d4e140..c7eabfd8a2 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -430,6 +430,7 @@ def graph(self, filepath=None): Raises: RuntimeError: If graphviz is not installed. + ValueError: If path is not writeable. """ graphviz = import_or_raise( "graphviz", error_msg="Please install graphviz to visualize pipelines." @@ -484,7 +485,10 @@ def graph_feature_importance(self, importance_threshold=0): importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is larger than importance_threshold. Defaults to zero. Returns: - plotly.Figure: A bar graph showing features and their corresponding importance + plotly.Figure: A bar graph showing features and their corresponding importance. + + Raises: + ValueError: If importance threshold is not valid. 
""" go = import_or_raise( "plotly.graph_objects", @@ -695,7 +699,6 @@ def can_tune_threshold_with_objective(self, objective): """Determine whether the threshold of a binary classification pipeline can be tuned. Args: - pipeline (PipelineBase): Binary classification pipeline. objective (ObjectiveBase): Primary AutoMLSearch objective. Returns: diff --git a/evalml/pipelines/pipeline_meta.py b/evalml/pipelines/pipeline_meta.py index 2a6913d0a5..c6a00c3aef 100644 --- a/evalml/pipelines/pipeline_meta.py +++ b/evalml/pipelines/pipeline_meta.py @@ -15,6 +15,9 @@ def check_for_fit(cls, method): Args: method (callable): Method to wrap. + Returns: + The wrapped method. + Raises: PipelineNotYetFittedError: If pipeline is not yet fitted. """ diff --git a/evalml/pipelines/regression_pipeline.py b/evalml/pipelines/regression_pipeline.py index eafd8d149d..f13acf4527 100644 --- a/evalml/pipelines/regression_pipeline.py +++ b/evalml/pipelines/regression_pipeline.py @@ -31,6 +31,9 @@ def fit(self, X, y): Returns: self + + Raises: + ValueError: If the target is not numeric. """ X = infer_feature_types(X) y = infer_feature_types(y) diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py index 26988e1f08..a9b0d56af0 100644 --- a/evalml/pipelines/time_series_classification_pipelines.py +++ b/evalml/pipelines/time_series_classification_pipelines.py @@ -79,6 +79,9 @@ def predict(self, X, y=None, objective=None): Returns: pd.Series: Predicted values. + + Raises: + ValueError: If final component is not an Estimator. """ if self.estimator is None: raise ValueError( @@ -101,9 +104,13 @@ def predict_proba(self, X, y=None): Args: X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]. + y (pd.Series): Target data. Returns: pd.DataFrame: Probability estimates. + + Raises: + ValueError: If final component is not an Estimator. """ if self.estimator is None: raise ValueError( diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py index 81773ca271..f1a33ffbbf 100644 --- a/evalml/pipelines/time_series_regression_pipeline.py +++ b/evalml/pipelines/time_series_regression_pipeline.py @@ -37,6 +37,9 @@ def predict(self, X, y=None, objective=None): Returns: pd.Series: Predicted values. + + Raises: + ValueError: If final component is not an estimator. """ if self.estimator is None: raise ValueError( diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index ec897d06f4..5f73952056 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -233,11 +233,14 @@ def generate_pipeline_code(element): """Creates and returns a string that contains the Python imports and code required for running the EvalML pipeline. Args: - element (pipeline instance): The instance of the pipeline to generate string Python code + element (pipeline instance): The instance of the pipeline to generate string Python code. Returns: str: String representation of Python code that can be run separately in order to recreate the pipeline instance. Does not include code for custom component implementation. + + Raises: + ValueError: If element is not a pipeline, or if the pipeline is nonlinear. 
""" # hold the imports needed and add code to end code_strings = [] diff --git a/evalml/problem_types/utils.py b/evalml/problem_types/utils.py index efafbcdb7b..26926931b1 100644 --- a/evalml/problem_types/utils.py +++ b/evalml/problem_types/utils.py @@ -9,10 +9,14 @@ def handle_problem_types(problem_type): """Handles problem_type by either returning the ProblemTypes or converting from a str. Args: - problem_type (str or ProblemTypes): Problem type that needs to be handled + problem_type (str or ProblemTypes): Problem type that needs to be handled. Returns: - ProblemTypes + ProblemTypes enum + + Raises: + KeyError: If input is not a valid ProblemTypes enum value. + ValueError: If input is not a string or ProblemTypes object. """ if isinstance(problem_type, str): try: @@ -31,7 +35,7 @@ def detect_problem_type(y): """Determine the type of problem is being solved based on the targets (binary vs multiclass classification, regression) Ignores missing and null data. Args: - y (pd.Series): the target labels to predict + y (pd.Series): The target labels to predict. Returns: ProblemType: ProblemType Enum @@ -40,6 +44,9 @@ def detect_problem_type(y): >>> y = pd.Series([0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1]) >>> problem_type = detect_problem_type(y) >>> assert problem_type == ProblemTypes.BINARY + + Raises: + ValueError: If the input has less than two classes. """ y = pd.Series(y).dropna() num_classes = y.nunique() diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index e035d3a83e..9db8d8cafb 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -22,9 +22,16 @@ def import_or_raise(library, error_msg=None, warning=False): """Attempts to import the requested library by name. If the import fails, raises an ImportError or warning. Args: - library (str): the name of the library - error_msg (str): error message to return if the import fails - warning (bool): if True, import_or_raise gives a warning instead of ImportError. Defaults to False. + library (str): The name of the library. + error_msg (str): Rrror message to return if the import fails. + warning (bool): If True, import_or_raise gives a warning instead of ImportError. Defaults to False. + + Returns: + Returns the library if importing succeeded. + + Raises: + ImportError: If attempting to import the library fails because the library is not installed. + Exception: If importing the library fails. """ try: return importlib.import_module(library) @@ -77,7 +84,13 @@ def get_random_state(seed): """Generates a numpy.random.RandomState instance using seed. Args: - seed (None, int, np.random.RandomState object): seed to use to generate numpy.random.RandomState. Must be between SEED_BOUNDS.min_bound and SEED_BOUNDS.max_bound, inclusive. Otherwise, an exception will be thrown. + seed (None, int, np.random.RandomState object): seed to use to generate numpy.random.RandomState. Must be between SEED_BOUNDS.min_bound and SEED_BOUNDS.max_bound, inclusive. + + Raises: + ValueError: If the input seed is not within the acceptable range. + + Returns: + A numpy.random.RandomState instance. """ if isinstance(seed, (int, np.integer)) and ( seed < SEED_BOUNDS.min_bound or SEED_BOUNDS.max_bound < seed @@ -103,7 +116,10 @@ def get_random_seed( max_bound (None, int): if not default of None, will be max bound when generating seed (exclusive). Must be greater than min_bound. Returns: - int: seed for random number generator + int: Seed for random number generator + + Raises: + ValueError: If boundaries are not valid. 
""" if not min_bound < max_bound: raise ValueError( @@ -298,6 +314,7 @@ def pad_with_nans(pd_data, num_to_pad): Args: pd_data (pd.DataFrame or pd.Series): Data to pad. + num_to_pad (int): Number of nans to pad. Returns: pd.DataFrame or pd.Series @@ -347,7 +364,7 @@ def drop_rows_with_nans(*pd_data): """Drop rows that have any NaNs in all dataframes or series. Args: - *pd_data (sequence of pd.Series or pd.DataFrame or None) + *pd_data: sequence of pd.Series or pd.DataFrame or None Returns: list of pd.DataFrame or pd.Series or None diff --git a/evalml/utils/logger.py b/evalml/utils/logger.py index 95dff24666..b12e1dfeee 100644 --- a/evalml/utils/logger.py +++ b/evalml/utils/logger.py @@ -9,6 +9,9 @@ def get_logger(name): Args: name (str): Name of the logger to get. + + Returns: + The logger object with the associated name. """ logger = logging.getLogger(name) if not len(logger.handlers): diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py index 7a7e4fb256..e711a271e0 100644 --- a/evalml/utils/woodwork_utils.py +++ b/evalml/utils/woodwork_utils.py @@ -56,6 +56,9 @@ def infer_feature_types(data, feature_types=None): Returns: A Woodwork data structure where the data type of each column was either specified or inferred. + + Raises: + ValueError: If there is a mismatch between the dataframe and the woodwork schema. """ if isinstance(data, list): data = _list_to_pandas(data) From fd439346e92ba48c8b599e60dcc19da8b07f1d57 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 9 Sep 2021 17:45:35 -0400 Subject: [PATCH 33/62] clean up --- Makefile | 7 +- evalml/automl/engine/cf_engine.py | 1 + evalml/automl/engine/dask_engine.py | 2 + evalml/automl/engine/sequential_engine.py | 1 + evalml/model_understanding/graphs.py | 83 +++++++++++++++-------- 5 files changed, 61 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 16a7f6ca4d..2523e791b7 100644 --- a/Makefile +++ b/Makefile @@ -8,10 +8,11 @@ clean: .PHONY: lint lint: - flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions - pydocstyle evalml/ --convention=google --add-ignore=D107 --add-select=D400 --match-dir='^(?!(tests)).*' - darglint evalml/ + isort --check-only evalml + python docs/notebook_version_standardizer.py check-versions black evalml -t py39 --check + pydocstyle evalml/ --convention=google --add-ignore=D107 --add-select=D400 --match-dir='^(?!(tests)).*' + find evalml -type f -not -path "evalml/tests/*" -a -name "*.py" | xargs flake8 .PHONY: lint-fix lint-fix: diff --git a/evalml/automl/engine/cf_engine.py b/evalml/automl/engine/cf_engine.py index c6fd236227..5317436906 100644 --- a/evalml/automl/engine/cf_engine.py +++ b/evalml/automl/engine/cf_engine.py @@ -38,6 +38,7 @@ def close(self): @property def is_closed(self): + """Property that determines whether the Engine's Client's resources are closed.""" if isinstance(self.pool, ProcessPoolExecutor): return self.pool._shutdown_thread elif isinstance(self.pool, ThreadPoolExecutor): diff --git a/evalml/automl/engine/dask_engine.py b/evalml/automl/engine/dask_engine.py index b045b079de..eb40c8e816 100644 --- a/evalml/automl/engine/dask_engine.py +++ b/evalml/automl/engine/dask_engine.py @@ -67,9 +67,11 @@ def __init__(self, cluster=None): self._data_futures_cache = {} def __enter__(self): + """Enter runtime context.""" return self def __exit__(self, exc_type, exc_val, exc_tb): + """Exit runtime context.""" self.close() def send_data_to_cluster(self, X, y): diff --git 
a/evalml/automl/engine/sequential_engine.py b/evalml/automl/engine/sequential_engine.py
index 3bbc23c4a1..f44fc04bdf 100644
--- a/evalml/automl/engine/sequential_engine.py
+++ b/evalml/automl/engine/sequential_engine.py
@@ -126,4 +126,5 @@ def submit_scoring_job(self, automl_config, pipeline, X, y, objectives):
return computation
def close(self):
+ """No-op."""
pass
diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index 43b61453a4..3b16e71105 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -319,8 +319,8 @@ def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_additio
Args:
y_true (pd.Series or np.ndarray): True labels.
y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. Note this should be a one dimensional array with the predicted probability for the "true" label in the binary case.
- custom_class_names (list): If not None, custom labels for classes. Default None.
- title_addition (str: if not None, append to plot title. Default None.
+ custom_class_names (list): If not None, custom labels for classes. Defaults to None.
+ title_addition (str): if not None, append to plot title. Defaults to None.
Returns:
plotly.Figure representing the ROC plot generated.
@@ -379,14 +379,17 @@ def graph_permutation_importance(pipeline, X, y, objective, importance_threshold
"""Generate a bar graph of the pipeline's permutation importance.
Args:
- pipeline (PipelineBase or subclass): Fitted pipeline
- X (pd.DataFrame): The input data used to score and compute permutation importance
- y (pd.Series): The target data
- objective (str, ObjectiveBase): Objective to score on
+ pipeline (PipelineBase or subclass): Fitted pipeline.
+ X (pd.DataFrame): The input data used to score and compute permutation importance.
+ y (pd.Series): The target data.
+ objective (str, ObjectiveBase): Objective to score on.
importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is larger than importance_threshold. Defaults to zero.
Returns:
plotly.Figure, a bar graph showing features and their respective permutation importance.
+
+ Raises:
+ ValueError: If importance_threshold is not greater than or equal to 0.
"""
go = import_or_raise(
"plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects"
@@ -438,14 +441,18 @@ def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100):
"""Compute objective score as a function of potential binary classification decision thresholds for a fitted binary classification pipeline.
Args:
- pipeline (BinaryClassificationPipeline obj): Fitted binary classification pipeline
- X (pd.DataFrame): The input data used to compute objective score
- y (pd.Series): The target labels
- objective (ObjectiveBase obj, str): Objective used to score
- steps (int): Number of intervals to divide and calculate objective score at
+ pipeline (BinaryClassificationPipeline obj): Fitted binary classification pipeline.
+ X (pd.DataFrame): The input data used to compute objective score.
+ y (pd.Series): The target labels.
+ objective (ObjectiveBase obj, str): Objective used to score.
+ steps (int): Number of intervals to divide and calculate objective score at.
Returns: - pd.DataFrame: DataFrame with thresholds and the corresponding objective score calculated at each threshold + pd.DataFrame: DataFrame with thresholds and the corresponding objective score calculated at each threshold. + + Raises: + ValueError: If objective is not a binary classification objective. + ValueError: If objective's `score_needs_proba` is not False. """ objective = get_objective(objective, return_instance=True) if not objective.is_defined_for_problem_type(ProblemTypes.BINARY): @@ -470,14 +477,14 @@ def graph_binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): """Generate a plot graphing objective score vs. decision thresholds for a fitted binary classification pipeline. Args: - pipeline (PipelineBase or subclass): Fitted pipeline - X (pd.DataFrame): The input data used to score and compute scores - y (pd.Series): The target labels - objective (ObjectiveBase obj, str): Objective used to score, shown on the y-axis of the graph - steps (int): Number of intervals to divide and calculate objective score at + pipeline (PipelineBase or subclass): Fitted pipeline. + X (pd.DataFrame): The input data used to score and compute scores. + y (pd.Series): The target labels. + objective (ObjectiveBase obj, str): Objective used to score, shown on the y-axis of the graph. + steps (int): Number of intervals to divide and calculate objective score at. Returns: - plotly.Figure representing the objective score vs. threshold graph generated + plotly.Figure representing the objective score vs. threshold graph generated. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -597,7 +604,7 @@ def partial_dependence( the grid but averaged over all of the samples in X. Returns: - pd.DataFrame, list[pd.DataFrame], or tuple(pd.DataFrame, list[pd.DataFrame]): + pd.DataFrame, list[pd.DataFrame], or tuple[pd.DataFrame, list[pd.DataFrame]]: When `kind='average'`: DataFrame with averaged predictions for all points in the grid averaged over all samples of X and the values used to calculate those predictions. @@ -620,12 +627,14 @@ def partial_dependence( feature value pair. Raises: - PartialDependenceError: if the user provides a tuple of not exactly two features. - PartialDependenceError: if the provided pipeline isn't fitted. - PartialDependenceError: if the provided pipeline is a Baseline pipeline. - PartialDependenceError: if any of the features passed in are completely NaN - PartialDependenceError: if any of the features are low-variance. Defined as having one value occurring more than the upper + PartialDependenceError: If the user provides a tuple of not exactly two features. + PartialDependenceError: If the provided pipeline isn't fitted. + PartialDependenceError: If the provided pipeline is a Baseline pipeline. + PartialDependenceError: If any of the features passed in are completely NaN. + PartialDependenceError: If any of the features are low-variance. Defined as having one value occurring more than the upper percentile passed by the user. By default 95%. + ValueError: Error during call to scikit-learn's partial dependence method. + Exception: All other errors during calculation. """ try: # Dynamically set the grid resolution to the maximum number of values @@ -1189,13 +1198,16 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): y_pred (pd.Series, or np.ndarray): The predicted values outputted by the regression model. 
outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference
between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow.
- Defaults to None
+ Defaults to None.
Returns:
pd.DataFrame with the following columns:
* `prediction`: Predicted values from regression model.
* `actual`: Real target values.
* `outlier`: Colors indicating which values are in the threshold for what is considered an outlier value.
+
+ Raises:
+ ValueError: If threshold is not positive.
"""
if outlier_threshold and outlier_threshold <= 0:
raise ValueError(
@@ -1228,10 +1240,13 @@ def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None):
y_true (pd.Series): The real target values of the data
y_pred (pd.Series): The predicted values outputted by the regression model.
outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference
between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow.
- Defaults to None
+ Defaults to None.
Returns:
- plotly.Figure representing the predicted vs. actual values graph
+ plotly.Figure representing the predicted vs. actual values graph.
+
+ Raises:
+ ValueError: If threshold is not positive.
"""
_go = import_or_raise(
"plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects"
@@ -1310,7 +1325,11 @@ def decision_tree_data_from_estimator(estimator):
estimator (ComponentBase): A fitted DecisionTree-based estimator.
Returns:
- OrderedDict: An OrderedDict of OrderedDicts describing a tree structure
+ OrderedDict: An OrderedDict of OrderedDicts describing a tree structure.
+
+ Raises:
+ ValueError: If estimator is not a decision tree-based estimator.
+ NotFittedError: If estimator is not yet fitted.
"""
if not estimator.model_family == ModelFamily.DECISION_TREE:
raise ValueError(
@@ -1327,13 +1346,17 @@ def decision_tree_data_from_pipeline(pipeline_):
- """Return data for a fitted pipeline with in a restructured format.
+ """Return data for a fitted pipeline in a restructured format.
Args:
pipeline_ (PipelineBase): A pipeline with a DecisionTree-based estimator.
Returns:
OrderedDict: An OrderedDict of OrderedDicts describing a tree structure.
+
+ Raises:
+ ValueError: If input pipeline is not a decision tree model.
+ NotFittedError: If pipeline is not fitted.
"""
if not pipeline_.model_family == ModelFamily.DECISION_TREE:
raise ValueError(
@@ -1439,7 +1462,7 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates):
dates (pd.Series): Dates corresponding to target values and predictions.
Returns:
- pd.DataFrame: Predictions vs time.
+ pd.DataFrame: Predictions vs time.
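+
+ Example (a hedged usage sketch; pipeline, X_holdout, y_holdout, and the "date" column are assumptions, and the pipeline is assumed to be fitted):
+ >>> df = get_prediction_vs_actual_over_time_data(pipeline, X_holdout, y_holdout, X_holdout["date"])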
""" dates = infer_feature_types(dates) y = infer_feature_types(y) From 2c0c74a506d2e45d2d7817280d208ae5bf72bdd0 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 9 Sep 2021 21:20:03 -0400 Subject: [PATCH 34/62] linting --- evalml/automl/automl_search.py | 8 +-- evalml/model_understanding/graphs.py | 60 +++++++++---------- .../time_series_classification_pipelines.py | 23 ++++--- evalml/pipelines/time_series_pipeline_base.py | 20 +++++-- 4 files changed, 62 insertions(+), 49 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 7195015832..e8716ec128 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -128,9 +128,9 @@ def search( problem_type (str or ProblemTypes): Type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. objective (str, ObjectiveBase): The objective to optimize for. Used to propose and rank pipelines, but not for optimizing each pipeline during fit-time. When set to 'auto', chooses: - - LogLossBinary for binary classification problems, - - LogLossMulticlass for multiclass classification problems, and - - R2 for regression problems. + - LogLossBinary for binary classification problems, + - LogLossMulticlass for multiclass classification problems, and + - R2 for regression problems. mode (str): mode for DefaultAlgorithm. There are two modes: fast and long, where fast is a subset of long. Please look at DefaultAlgorithm for more details. max_time (int, str): Maximum time to search for pipelines. This will not start a new pipeline search after the duration @@ -225,7 +225,7 @@ def search_iterative( - LogLossMulticlass for multiclass classification problems, and - R2 for regression problems. problem_configuration (dict): Additional parameters needed to configure the search. For example, - in time series problems, values should be passed in for the date_index, gap, forecast_horizon, and max_delay variables. + in time series problems, values should be passed in for the date_index, gap, forecast_horizon, and max_delay variables. **kwargs: Other keyword arguments which are provided will be passed to AutoMLSearch. Returns: diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 3b16e71105..34c80b1c5f 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -113,10 +113,10 @@ def graph_confusion_matrix( y_true (pd.Series or np.ndarray): True binary labels. y_pred (pd.Series or np.ndarray): Predictions from a binary classifier. normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. - title_addition (str or None): if not None, append to plot title. Defaults to None. + title_addition (str or None): If not None, append to plot title. Defaults to None. Returns: - plotly.Figure representing the confusion matrix plot generated + plotly.Figure representing the confusion matrix plot generated. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -194,11 +194,10 @@ def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): Returns: list: Dictionary containing metrics used to generate a precision-recall plot, with the following keys: - - * `precision`: Precision values. - * `recall`: Recall values. 
* `thresholds`: Threshold values used to produce the precision and recall.
- * `auc_score`: The area under the ROC curve.
+ * `precision`: Precision values.
+ * `recall`: Recall values.
+ * `thresholds`: Threshold values used to produce the precision and recall.
+ * `auc_score`: The area under the ROC curve.
Raises:
NoPositiveLabelException: If predicted probabilities do not contain a column at the specified label.
@@ -231,7 +230,7 @@ def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None):
Args:
y_true (pd.Series or np.ndarray): True binary labels.
y_pred_proba (pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label.
- title_addition (str or None): If not None, append to plot title. Default None.
+ title_addition (str or None): If not None, append to plot title. Defaults to None.
Returns:
plotly.Figure representing the precision-recall plot generated
@@ -274,10 +273,10 @@ def roc_curve(y_true, y_pred_proba):
Returns:
list(dict): A list of dictionaries (with one for each class) is returned. Binary classification problems return a list with one dictionary. Each dictionary contains metrics used to generate an ROC plot with the following keys:
- * `fpr_rate`: False positive rate.
- * `tpr_rate`: True positive rate.
- * `threshold`: Threshold values used to produce each pair of true/false positive rates.
- * `auc_score`: The area under the ROC curve.
+ * `fpr_rate`: False positive rate.
+ * `tpr_rate`: True positive rate.
+ * `threshold`: Threshold values used to produce each pair of true/false positive rates.
+ * `auc_score`: The area under the ROC curve.
"""
y_true = infer_feature_types(y_true).to_numpy()
y_pred_proba = infer_feature_types(y_pred_proba).to_numpy()
@@ -320,7 +319,7 @@ def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_additio
y_true (pd.Series or np.ndarray): True labels.
y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. Note this should be a one dimensional array with the predicted probability for the "true" label in the binary case.
custom_class_names (list): If not None, custom labels for classes. Defaults to None.
- title_addition (str): if not None, append to plot title. Defaults to None.
+ title_addition (str): If not None, append to plot title. Defaults to None.
Returns:
plotly.Figure representing the ROC plot generated.
@@ -591,9 +590,9 @@ def partial_dependence(
X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values
for feature where partial dependence will be calculated at.
features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for.
- If features is an int, it must be the index of the feature to use.
- If features is a string, it must be a valid column name in X.
- If features is a tuple of int/strings, it must contain valid column integers/names in X.
+ - If features is an int, it must be the index of the feature to use.
+ - If features is a string, it must be a valid column name in X.
+ - If features is a tuple of int/strings, it must contain valid column integers/names in X.
percentiles (tuple[float]): The lower and upper percentile used to create the extreme values for the grid. Must be in [0, 1]. Defaults to (0.05, 0.95).
grid_resolution (int): Number of samples of feature(s) for partial dependence plot.
If this value @@ -631,8 +630,7 @@ def partial_dependence( PartialDependenceError: If the provided pipeline isn't fitted. PartialDependenceError: If the provided pipeline is a Baseline pipeline. PartialDependenceError: If any of the features passed in are completely NaN. - PartialDependenceError: If any of the features are low-variance. Defined as having one value occurring more than the upper - percentile passed by the user. By default 95%. + PartialDependenceError: If any of the features are low-variance. Defined as having one value occurring more than the upper percentile passed by the user. By default 95%. ValueError: Error during call to scikit-learn's partial dependence method. Exception: All other errors during calculation. """ @@ -955,24 +953,24 @@ def graph_partial_dependence( X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at. features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. - If features is an int, it must be the index of the feature to use. - If features is a string, it must be a valid column name in X. - If features is a tuple of strings, it must contain valid column int/names in X. + - If features is an int, it must be the index of the feature to use. + - If features is a string, it must be a valid column name in X. + - If features is a tuple of strings, it must contain valid column int/names in X. class_label (string, optional): Name of class to plot for multiclass problems. If None, will plot the partial dependence for each class. This argument does not change behavior for regression or binary classification pipelines. For binary classification, the partial dependence for the positive label will always be displayed. Defaults to None. grid_resolution (int): Number of samples of feature(s) for partial dependence plot. kind ({'average', 'individual', 'both'}): Type of partial dependence to plot. 'average' creates a regular partial dependence - (PD) graph, 'individual' creates an individual conditional expectation (ICE) plot, and 'both' creates a - single-figure PD and ICE plot. ICE plots can only be shown for one-way partial dependence plots. + (PD) graph, 'individual' creates an individual conditional expectation (ICE) plot, and 'both' creates a + single-figure PD and ICE plot. ICE plots can only be shown for one-way partial dependence plots. Returns: plotly.graph_objects.Figure: figure object containing the partial dependence data for plotting Raises: - PartialDependenceError: if a graph is requested for a class name that isn't present in the pipeline. - PartialDependenceError: if an ICE plot is requested for a two-way partial dependence. + PartialDependenceError: If a graph is requested for a class name that isn't present in the pipeline. + PartialDependenceError: If an ICE plot is requested for a two-way partial dependence. """ X = infer_feature_types(X) if isinstance(features, (list, tuple)): @@ -1197,14 +1195,13 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): y_true (pd.Series, or np.ndarray): The real target values of the data y_pred (pd.Series, or np.ndarray): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference - between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow. 
- Defaults to None. + between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow. Defaults to None. Returns: pd.DataFrame with the following columns: - * `prediction`: Predicted values from regression model. - * `actual`: Real target values. - * `outlier`: Colors indicating which values are in the threshold for what is considered an outlier value. + * `prediction`: Predicted values from regression model. + * `actual`: Real target values. + * `outlier`: Colors indicating which values are in the threshold for what is considered an outlier value. Raises: ValueError: If threshold is not positive. @@ -1239,8 +1236,7 @@ def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): y_true (pd.Series): The real target values of the data y_pred (pd.Series): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference - between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow. - Defaults to None. + between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow. Defaults to None. Returns: plotly.Figure representing the predicted vs. actual values graph. diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py index f72eaf87e6..086139c973 100644 --- a/evalml/pipelines/time_series_classification_pipelines.py +++ b/evalml/pipelines/time_series_classification_pipelines.py @@ -65,6 +65,9 @@ def predict_proba_in_sample(self, X_holdout, y_holdout, X_train, y_train): Returns: pd.Series: Estimated probabilities. + + Raises: + ValueError: If the final component is not an Estimator. """ if self.estimator is None: raise ValueError( @@ -89,14 +92,17 @@ def predict_in_sample(self, X, y, X_train, y_train, objective=None): """Predict on future data where the target is known, e.g. cross validation. Args: - X_holdout (pd.DataFrame or np.ndarray): Future data of shape [n_samples, n_features]. - y_holdout (pd.Series, np.ndarray): Future target of shape [n_samples]. + X (pd.DataFrame or np.ndarray): Future data of shape [n_samples, n_features]. + y (pd.Series, np.ndarray): Future target of shape [n_samples]. X_train (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features]. y_train (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train]. objective (ObjectiveBase, str, None): Objective used to threshold predicted probabilities, optional. Returns: pd.Series: Estimated labels. + + Raises: + ValueError: If final component is not an Estimator. """ if self.estimator is None: raise ValueError( @@ -213,14 +219,17 @@ def predict_in_sample(self, X, y, X_train, y_train, objective=None): """Predict on future data where the target is known, e.g. cross validation. Args: - X_holdout (pd.DataFrame or np.ndarray): Future data of shape [n_samples, n_features]. - y_holdout (pd.Series, np.ndarray): Future target of shape [n_samples]. - X_train (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_feautures]. - y_train (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train]. - objective (ObjectiveBase, str, None): Objective used to threshold predicted probabilities, optional. 
+ X (pd.DataFrame): Future data of shape [n_samples, n_features].
+ y (pd.Series): Future target of shape [n_samples].
+ X_train (pd.DataFrame): Data the pipeline was trained on of shape [n_samples_train, n_features].
+ y_train (pd.Series): Targets used to train the pipeline of shape [n_samples_train].
+ objective (ObjectiveBase, str): Objective used to threshold predicted probabilities, optional.
Returns:
pd.Series: Estimated labels.
+
+ Raises:
+ ValueError: If objective is not defined for time-series binary classification problems.
"""
if objective is not None:
objective = get_objective(objective, return_instance=True)
diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py
index 775b36112f..75b62fa813 100644
--- a/evalml/pipelines/time_series_pipeline_base.py
+++ b/evalml/pipelines/time_series_pipeline_base.py
@@ -171,14 +171,17 @@ def predict_in_sample(self, X, y, X_train, y_train, objective=None):
"""Predict on future data where the target is known, e.g. cross validation.
Args:
- X_holdout (pd.DataFrame or np.ndarray): Future data of shape [n_samples, n_features]
- y_holdout (pd.Series, np.ndarray): Future target of shape [n_samples]
+ X (pd.DataFrame or np.ndarray): Future data of shape [n_samples, n_features]
+ y (pd.Series, np.ndarray): Future target of shape [n_samples]
X_train (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features]
y_train (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train]
objective (ObjectiveBase, str, None): Objective used to threshold predicted probabilities, optional.
Returns:
- pd.Series: Estimated labels
+ pd.Series: Estimated labels.
+
+ Raises:
+ ValueError: If final component is not an Estimator.
"""
if self.estimator is None:
raise ValueError(
@@ -203,10 +206,15 @@ def predict(self, X, objective=None, X_train=None, y_train=None):
Args:
X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features].
- objective (str, ObjectiveBase): Used in classification problems to threshold the predictions.
objective (Object or string): The objective to use to make predictions.
- X_train (pd.DataFrame or np.ndarray or None): Training data. Ignored. Only used for time series.
- y_train (pd.Series or None): Training labels. Ignored. Only used for time series.
+ X_train (pd.DataFrame or np.ndarray or None): Training data.
+ y_train (pd.Series or None): Training labels.
+
+ Raises:
+ ValueError: If final component is not an Estimator.
+
+ Returns:
+ Predictions.
"""
X_train, y_train = self._convert_to_woodwork(X_train, y_train)
if self.estimator is None:
raise ValueError(
From e4bd69390a112314de227223f8b3e23ea8b8b417 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Thu, 9 Sep 2021 21:47:47 -0400
Subject: [PATCH 35/62] try rtd
---
evalml/model_understanding/force_plots.py | 2 +-
evalml/model_understanding/graphs.py | 28 ++++++++-----------
.../permutation_importance.py | 3 +-
3 files changed, 14 insertions(+), 19 deletions(-)
diff --git a/evalml/model_understanding/force_plots.py b/evalml/model_understanding/force_plots.py
index 809b7e65b9..63ef11846a 100644
--- a/evalml/model_understanding/force_plots.py
+++ b/evalml/model_understanding/force_plots.py
@@ -58,7 +58,7 @@ def gen_force_plot(shap_values, training_data, expected_value, matplotlib):
def force_plot(pipeline, rows_to_explain, training_data, y):
- r"""Function to generate the data required to build a force plot.
+ """Function to generate the data required to build a force plot.
Args: pipeline (PipelineBase): The pipeline to generate the force plot for. diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 34c80b1c5f..c5cd66c5b7 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -517,14 +517,11 @@ def _is_feature_of_type(feature, X, ltype): def _put_categorical_feature_first(features, first_feature_categorical): - """If the user is doing a two-way partial dependence plot and one of the features is categorical, we need to make sure the categorical feature is the first element in the tuple that's passed to sklearn. - - This is because in the two-way grid calculation, sklearn will try to - coerce every element of the grid to the type of the first feature in - the tuple. If we put the categorical feature first, the grid will be - of type 'object' which can accommodate both categorical and numeric - data. If we put the numeric feature first, the grid will be of type - float64 and we can't coerce categoricals to float64 dtype. + """If the user is doing a two-way partial dependence plot and one of the features is categorical, we ensure that the categorical feature is the first element in the tuple that's passed to sklearn. + + This is because in the two-way grid calculation, sklearn will try to coerce every element of the grid to the type of the first feature in + the tuple. If we put the categorical feature first, the grid will be of type 'object' which can accommodate both categorical and numeric + data. If we put the numeric feature first, the grid will be of type float64 and we can't coerce categoricals to float64 dtype. """ new_features = features if first_feature_categorical else (features[1], features[0]) return new_features @@ -582,17 +579,16 @@ def partial_dependence( If a single integer or string is given for features, one-way partial dependence is calculated. If a tuple of two integers or strings is given, two-way partial dependence - is calculated with the first feature in the y-axis and second feature in the - x-axis. + is calculated with the first feature in the y-axis and second feature in the x-axis. Args: pipeline (PipelineBase or subclass): Fitted pipeline. X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at. features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. - - If features is an int, it must be the index of the feature to use. - - If features is a string, it must be a valid column name in X. - - If features is a tuple of int/strings, it must contain valid column integers/names in X. + - If features is an int, it must be the index of the feature to use. + - If features is a string, it must be a valid column name in X. + - If features is a tuple of int/strings, it must contain valid column integers/names in X. percentiles (tuple[float]): The lower and upper percentile used to create the extreme values for the grid. Must be in [0, 1]. Defaults to (0.05, 0.95). grid_resolution (int): Number of samples of feature(s) for partial dependence plot. If this value @@ -953,9 +949,9 @@ def graph_partial_dependence( X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at. features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. 
- - If features is an int, it must be the index of the feature to use. - - If features is a string, it must be a valid column name in X. - - If features is a tuple of strings, it must contain valid column int/names in X. + - If features is an int, it must be the index of the feature to use. + - If features is a string, it must be a valid column name in X. + - If features is a tuple of strings, it must contain valid column int/names in X. class_label (string, optional): Name of class to plot for multiclass problems. If None, will plot the partial dependence for each class. This argument does not change behavior for regression or binary classification pipelines. For binary classification, the partial dependence for the positive label will diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py index 80f49c140f..7328a5c02f 100644 --- a/evalml/model_understanding/permutation_importance.py +++ b/evalml/model_understanding/permutation_importance.py @@ -238,8 +238,7 @@ def _slow_permutation_importance( ): """If `col_name` is not None, calculates permutation importance for only the column with that name. - Otherwise, calculates the permutation importance for all columns in - the input dataframe. + Otherwise, calculates the permutation importance for all columns in the input dataframe. """ baseline_score = _slow_scorer(pipeline, X, y, objective) if col_name is None: From b58e3ed40ffe1cae23fa272cf367841d53b6b327 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 10 Sep 2021 00:35:00 -0400 Subject: [PATCH 36/62] try --- evalml/model_understanding/graphs.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index c5cd66c5b7..14f8a1a804 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -585,10 +585,11 @@ def partial_dependence( pipeline (PipelineBase or subclass): Fitted pipeline. X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at. - features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. - - If features is an int, it must be the index of the feature to use. - - If features is a string, it must be a valid column name in X. - - If features is a tuple of int/strings, it must contain valid column integers/names in X. + features (int, string, tuple[int or string]): The target feature for which + to create the partial dependence plot for. + - If features is an int, it must be the index of the feature to use. + - If features is a string, it must be a valid column name in X. + - If features is a tuple of int/strings, it must contain valid column integers/names in X. percentiles (tuple[float]): The lower and upper percentile used to create the extreme values for the grid. Must be in [0, 1]. Defaults to (0.05, 0.95). grid_resolution (int): Number of samples of feature(s) for partial dependence plot. If this value @@ -946,12 +947,12 @@ def graph_partial_dependence( Args: pipeline (PipelineBase or subclass): Fitted pipeline. - X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values - for feature where partial dependence will be calculated at. - features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. 
- - If features is an int, it must be the index of the feature to use. - - If features is a string, it must be a valid column name in X. - - If features is a tuple of strings, it must contain valid column int/names in X. + X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at. + features (int, string, tuple[int or string]): The target feature for which to + create the partial dependence plot for. + - If features is an int, it must be the index of the feature to use. + - If features is a string, it must be a valid column name in X. + - If features is a tuple of strings, it must contain valid column int/names in X. class_label (string, optional): Name of class to plot for multiclass problems. If None, will plot the partial dependence for each class. This argument does not change behavior for regression or binary classification pipelines. For binary classification, the partial dependence for the positive label will @@ -962,7 +963,7 @@ def graph_partial_dependence( single-figure PD and ICE plot. ICE plots can only be shown for one-way partial dependence plots. Returns: - plotly.graph_objects.Figure: figure object containing the partial dependence data for plotting + plotly.graph_objects.Figure: Figure object containing the partial dependence data for plotting. Raises: PartialDependenceError: If a graph is requested for a class name that isn't present in the pipeline. From 76b78796c9b93c79cb0fbe3bab46c609e9c63882 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 10 Sep 2021 10:12:07 -0400 Subject: [PATCH 37/62] try without bullets --- evalml/model_understanding/graphs.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 14f8a1a804..2715aaafb4 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -587,9 +587,9 @@ def partial_dependence( for feature where partial dependence will be calculated at. features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. - - If features is an int, it must be the index of the feature to use. - - If features is a string, it must be a valid column name in X. - - If features is a tuple of int/strings, it must contain valid column integers/names in X. + If features is an int, it must be the index of the feature to use. + If features is a string, it must be a valid column name in X. + If features is a tuple of int/strings, it must contain valid column integers/names in X. percentiles (tuple[float]): The lower and upper percentile used to create the extreme values for the grid. Must be in [0, 1]. Defaults to (0.05, 0.95). grid_resolution (int): Number of samples of feature(s) for partial dependence plot. If this value @@ -950,9 +950,9 @@ def graph_partial_dependence( X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at. features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. - - If features is an int, it must be the index of the feature to use. - - If features is a string, it must be a valid column name in X. - - If features is a tuple of strings, it must contain valid column int/names in X. + If features is an int, it must be the index of the feature to use. + If features is a string, it must be a valid column name in X. 
+ If features is a tuple of strings, it must contain valid column int/names in X. class_label (string, optional): Name of class to plot for multiclass problems. If None, will plot the partial dependence for each class. This argument does not change behavior for regression or binary classification pipelines. For binary classification, the partial dependence for the positive label will @@ -1189,7 +1189,7 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): """Combine y_true and y_pred into a single dataframe and adds a column for outliers. Used in `graph_prediction_vs_actual()`. Args: - y_true (pd.Series, or np.ndarray): The real target values of the data + y_true (pd.Series, or np.ndarray): The real target values of the data. y_pred (pd.Series, or np.ndarray): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow. Defaults to None. From 07a4f9391befedb8f4391898d424518c741a9f0b Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 10 Sep 2021 10:59:35 -0400 Subject: [PATCH 38/62] revert graphs.py to see what changes --- evalml/model_understanding/graphs.py | 305 ++++++++++++--------------- 1 file changed, 137 insertions(+), 168 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 2715aaafb4..01c45e6340 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1,5 +1,3 @@ -"""Model understanding graphing utilities.""" - import copy import os import warnings @@ -44,9 +42,9 @@ def confusion_matrix(y_true, y_predicted, normalize_method="true"): """Confusion matrix for binary and multiclass classification. - Args: + Arguments: y_true (pd.Series or np.ndarray): True binary labels. - y_predicted (pd.Series or np.ndarray): Predictions from a binary classifier. + y_pred (pd.Series or np.ndarray): Predictions from a binary classifier. normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. Returns: @@ -65,17 +63,14 @@ def confusion_matrix(y_true, y_predicted, normalize_method="true"): def normalize_confusion_matrix(conf_mat, normalize_method="true"): - """Normalize a confusion matrix. + """Normalizes a confusion matrix. - Args: + Arguments: conf_mat (pd.DataFrame or np.ndarray): Confusion matrix to normalize. normalize_method ({'true', 'pred', 'all'}): Normalization method. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. Returns: pd.DataFrame: normalized version of the input confusion matrix. The column header represents the predicted labels while row header represents the actual labels. - - Raises: - ValueError: If configuration is invalid, or if the sum of a given axis is zero and normalization by axis is specified. """ conf_mat = infer_feature_types(conf_mat) col_names = conf_mat.columns @@ -109,14 +104,14 @@ def graph_confusion_matrix( If `normalize_method` is set, hover text will show raw count, otherwise hover text will show count normalized with method 'true'. - Args: + Arguments: y_true (pd.Series or np.ndarray): True binary labels. 
y_pred (pd.Series or np.ndarray): Predictions from a binary classifier. normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. - title_addition (str or None): If not None, append to plot title. Defaults to None. + title_addition (str or None): if not None, append to plot title. Defaults to None. Returns: - plotly.Figure representing the confusion matrix plot generated. + plotly.Figure representing the confusion matrix plot generated """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -185,22 +180,21 @@ def graph_confusion_matrix( def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): - """Given labels and binary classifier predicted probabilities, compute and return the data representing a precision-recall curve. + """ + Given labels and binary classifier predicted probabilities, compute and return the data representing a precision-recall curve. - Args: + Arguments: y_true (pd.Series or np.ndarray): True binary labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label. pos_label_idx (int): the column index corresponding to the positive class. If predicted probabilities are two-dimensional, this will be used to access the probabilities for the positive class. Returns: list: Dictionary containing metrics used to generate a precision-recall plot, with the following keys: - * `precision`: Precision values. - * `recall`: Recall values. - * `thresholds`: Threshold values used to produce the precision and recall. - * `auc_score`: The area under the ROC curve. - Raises: - NoPositiveLabelException: If predicted probabilities do not contain a column at the specified label. + * `precision`: Precision values. + * `recall`: Recall values. + * `thresholds`: Threshold values used to produce the precision and recall. + * `auc_score`: The area under the ROC curve. """ y_true = infer_feature_types(y_true) y_pred_proba = infer_feature_types(y_pred_proba) @@ -227,10 +221,10 @@ def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): """Generate and display a precision-recall plot. - Args: + Arguments: y_true (pd.Series or np.ndarray): True binary labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label. - title_addition (str or None): If not None, append to plot title. Defaults to None. + title_addition (str or None): If not None, append to plot title. Default None. Returns: plotly.Figure representing the precision-recall plot generated @@ -264,19 +258,20 @@ def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): def roc_curve(y_true, y_pred_proba): - """Given labels and classifier predicted probabilities, compute and return the data representing a Receiver Operating Characteristic (ROC) curve. Works with binary or multiclass problems. + """ + Given labels and classifier predicted probabilities, compute and return the data representing a Receiver Operating Characteristic (ROC) curve. Works with binary or multiclass problems. - Args: + Arguments: y_true (pd.Series or np.ndarray): True labels. 
y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. Returns: list(dict): A list of dictionaries (with one for each class) is returned. Binary classification problems return a list with one dictionary. Each dictionary contains metrics used to generate an ROC plot with the following keys: - * `fpr_rate`: False positive rate. - * `tpr_rate`: True positive rate. - * `threshold`: Threshold values used to produce each pair of true/false positive rates. - * `auc_score`: The area under the ROC curve. + * `fpr_rate`: False positive rate. + * `tpr_rate`: True positive rate. + * `threshold`: Threshold values used to produce each pair of true/false positive rates. + * `auc_score`: The area under the ROC curve. """ y_true = infer_feature_types(y_true).to_numpy() y_pred_proba = infer_feature_types(y_pred_proba).to_numpy() @@ -315,17 +310,14 @@ def roc_curve(y_true, y_pred_proba): def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_addition=None): """Generate and display a Receiver Operating Characteristic (ROC) plot for binary and multiclass classification problems. - Args: + Arguments: y_true (pd.Series or np.ndarray): True labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. Note this should a one dimensional array with the predicted probability for the "true" label in the binary case. - custom_class_names (list): If not None, custom labels for classes. Defaults to None. - title_addition (str): If not None, append to plot title. Defaults to None. + custom_class_labels (list or None): If not None, custom labels for classes. Default None. + title_addition (str or None): if not None, append to plot title. Default None. Returns: - plotly.Figure representing the ROC plot generated. - - Raises: - ValueError: If the number of custom class names does not match number of classes in the input data. + plotly.Figure representing the ROC plot generated """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -377,18 +369,15 @@ def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_additio def graph_permutation_importance(pipeline, X, y, objective, importance_threshold=0): """Generate a bar graph of the pipeline's permutation importance. - Args: - pipeline (PipelineBase or subclass): Fitted pipeline. - X (pd.DataFrame): The input data used to score and compute permutation importance. - y (pd.Series): The target data. - objective (str, ObjectiveBase): Objective to score on. + Arguments: + pipeline (PipelineBase or subclass): Fitted pipeline + X (pd.DataFrame): The input data used to score and compute permutation importance + y (pd.Series): The target data + objective (str, ObjectiveBase): Objective to score on importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is larger than importance_threshold. Defaults to zero. Returns: plotly.Figure, a bar graph showing features and their respective permutation importance. - - Raises: - ValueError: If importance_threshold is not greater than or equal to 0. 
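
# A hedged sketch of consuming `roc_curve` as documented above: it returns one
# dict per class (a single-element list for binary problems). `y_true` and
# `y_pred_proba` are placeholder arrays, not objects defined in this patch.
from evalml.model_understanding.graphs import roc_curve

roc_data = roc_curve(y_true, y_pred_proba)
for class_data in roc_data:
    # Each dict carries the documented keys for one class.
    print(class_data["auc_score"], class_data["fpr_rate"], class_data["tpr_rate"])
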
""" go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -437,21 +426,19 @@ def graph_permutation_importance(pipeline, X, y, objective, importance_threshold def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): - """Compute objective score as a function of potential binary classification decision thresholds for a fitted binary classification pipeline. + """Computes objective score as a function of potential binary classification + decision thresholds for a fitted binary classification pipeline. - Args: - pipeline (BinaryClassificationPipeline obj): Fitted binary classification pipeline. - X (pd.DataFrame): The input data used to compute objective score. - y (pd.Series): The target labels. - objective (ObjectiveBase obj, str): Objective used to score. - steps (int): Number of intervals to divide and calculate objective score at. + Arguments: + pipeline (BinaryClassificationPipeline obj): Fitted binary classification pipeline + X (pd.DataFrame): The input data used to compute objective score + y (pd.Series): The target labels + objective (ObjectiveBase obj, str): Objective used to score + steps (int): Number of intervals to divide and calculate objective score at Returns: - pd.DataFrame: DataFrame with thresholds and the corresponding objective score calculated at each threshold. + pd.DataFrame: DataFrame with thresholds and the corresponding objective score calculated at each threshold - Raises: - ValueError: If objective is not a binary classification objective. - ValueError: If objective's `score_needs_proba` is not False. """ objective = get_objective(objective, return_instance=True) if not objective.is_defined_for_problem_type(ProblemTypes.BINARY): @@ -473,17 +460,18 @@ def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): def graph_binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): - """Generate a plot graphing objective score vs. decision thresholds for a fitted binary classification pipeline. + """Generates a plot graphing objective score vs. decision thresholds for a fitted binary classification pipeline. - Args: - pipeline (PipelineBase or subclass): Fitted pipeline. - X (pd.DataFrame): The input data used to score and compute scores. - y (pd.Series): The target labels. - objective (ObjectiveBase obj, str): Objective used to score, shown on the y-axis of the graph. - steps (int): Number of intervals to divide and calculate objective score at. + Arguments: + pipeline (PipelineBase or subclass): Fitted pipeline + X (pd.DataFrame): The input data used to score and compute scores + y (pd.Series): The target labels + objective (ObjectiveBase obj, str): Objective used to score, shown on the y-axis of the graph + steps (int): Number of intervals to divide and calculate objective score at Returns: - plotly.Figure representing the objective score vs. threshold graph generated. + plotly.Figure representing the objective score vs. threshold graph generated + """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -517,11 +505,13 @@ def _is_feature_of_type(feature, X, ltype): def _put_categorical_feature_first(features, first_feature_categorical): - """If the user is doing a two-way partial dependence plot and one of the features is categorical, we ensure that the categorical feature is the first element in the tuple that's passed to sklearn. 
+ """If the user is doing a two-way partial dependence plot and one of the features is categorical, + we need to make sure the categorical feature is the first element in the tuple that's passed to sklearn. - This is because in the two-way grid calculation, sklearn will try to coerce every element of the grid to the type of the first feature in - the tuple. If we put the categorical feature first, the grid will be of type 'object' which can accommodate both categorical and numeric - data. If we put the numeric feature first, the grid will be of type float64 and we can't coerce categoricals to float64 dtype. + This is because in the two-way grid calculation, sklearn will try to coerce every element of the grid to the + type of the first feature in the tuple. If we put the categorical feature first, the grid will be of type 'object' + which can accommodate both categorical and numeric data. If we put the numeric feature first, the grid will be of + type float64 and we can't coerce categoricals to float64 dtype. """ new_features = features if first_feature_categorical else (features[1], features[0]) return new_features @@ -539,7 +529,7 @@ def _get_feature_names_from_str_or_col_index(X, names_or_col_indices): def _raise_value_error_if_any_features_all_nan(df): - """Validate partial dependence data by checking if any features have all NaN values.""" + """Helper for partial dependence data validation.""" nan_pct = df.isna().mean() all_nan = nan_pct[nan_pct == 1].index.tolist() all_nan = [f"'{name}'" for name in all_nan] @@ -553,7 +543,7 @@ def _raise_value_error_if_any_features_all_nan(df): def _raise_value_error_if_mostly_one_value(df, percentile): - """Validate partial dependence data by checking if features are mostly one value.""" + """Helper for partial dependence data validation.""" one_value = [] values = [] @@ -575,18 +565,17 @@ def _raise_value_error_if_mostly_one_value(df, percentile): def partial_dependence( pipeline, X, features, percentiles=(0.05, 0.95), grid_resolution=100, kind="average" ): - """Calculate one or two-way partial dependence. - - If a single integer or string is given for features, one-way partial dependence is calculated. - If a tuple of two integers or strings is given, two-way partial dependence - is calculated with the first feature in the y-axis and second feature in the x-axis. - - Args: - pipeline (PipelineBase or subclass): Fitted pipeline. + """Calculates one or two-way partial dependence. If a single integer or + string is given for features, one-way partial dependence is calculated. If + a tuple of two integers or strings is given, two-way partial dependence + is calculated with the first feature in the y-axis and second feature in the + x-axis. + + Arguments: + pipeline (PipelineBase or subclass): Fitted pipeline X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values - for feature where partial dependence will be calculated at. - features (int, string, tuple[int or string]): The target feature for which - to create the partial dependence plot for. + for feature where partial dependence will be calculated at + features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. If features is an int, it must be the index of the feature to use. If features is a string, it must be a valid column name in X. If features is a tuple of int/strings, it must contain valid column integers/names in X. 
@@ -595,12 +584,12 @@ def partial_dependence( grid_resolution (int): Number of samples of feature(s) for partial dependence plot. If this value is less than the maximum number of categories present in categorical data within X, it will be set to the max number of categories + 1. Defaults to 100. - kind ({'average', 'individual', 'both'}): The type of predictions to return. 'individual' will return the predictions for + kind {'average', 'individual', 'both'}: The type of predictions to return. 'individual' will return the predictions for all of the points in the grid for each sample in X. 'average' will return the predictions for all of the points in the grid but averaged over all of the samples in X. Returns: - pd.DataFrame, list[pd.DataFrame], or tuple[pd.DataFrame, list[pd.DataFrame]]: + pd.DataFrame, list(pd.DataFrame), or tuple(pd.DataFrame, list(pd.DataFrame)): When `kind='average'`: DataFrame with averaged predictions for all points in the grid averaged over all samples of X and the values used to calculate those predictions. @@ -623,14 +612,14 @@ def partial_dependence( feature value pair. Raises: - PartialDependenceError: If the user provides a tuple of not exactly two features. - PartialDependenceError: If the provided pipeline isn't fitted. - PartialDependenceError: If the provided pipeline is a Baseline pipeline. - PartialDependenceError: If any of the features passed in are completely NaN. - PartialDependenceError: If any of the features are low-variance. Defined as having one value occurring more than the upper percentile passed by the user. By default 95%. - ValueError: Error during call to scikit-learn's partial dependence method. - Exception: All other errors during calculation. + PartialDependenceError: if the user provides a tuple of not exactly two features. + PartialDependenceError: if the provided pipeline isn't fitted. + PartialDependenceError: if the provided pipeline is a Baseline pipeline. + PartialDependenceError: if any of the features passed in are completely NaN + PartialDependenceError: if any of the features are low-variance. Defined as having one value occurring more than the upper + percentile passed by the user. By default 95%. """ + try: # Dynamically set the grid resolution to the maximum number of values # in the categorical/datetime variables if there are more categories/datetime values than resolution cells @@ -943,13 +932,17 @@ def _update_fig_with_two_way_partial_dependence( def graph_partial_dependence( pipeline, X, features, class_label=None, grid_resolution=100, kind="average" ): - """Create an one-way or two-way partial dependence plot. Passing a single integer or string as features will create a one-way partial dependence plot with the feature values plotted against the partial dependence. Passing features a tuple of int/strings will create a two-way partial dependence plot with a contour of feature[0] in the y-axis, feature[1] in the x-axis and the partial dependence in the z-axis. - - Args: - pipeline (PipelineBase or subclass): Fitted pipeline. - X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at. - features (int, string, tuple[int or string]): The target feature for which to - create the partial dependence plot for. + """Create an one-way or two-way partial dependence plot. Passing a single integer or + string as features will create a one-way partial dependence plot with the feature values + plotted against the partial dependence. 
Passing features a tuple of int/strings will create + a two-way partial dependence plot with a contour of feature[0] in the y-axis, feature[1] + in the x-axis and the partial dependence in the z-axis. + + Arguments: + pipeline (PipelineBase or subclass): Fitted pipeline + X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values + for feature where partial dependence will be calculated at + features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. If features is an int, it must be the index of the feature to use. If features is a string, it must be a valid column name in X. If features is a tuple of strings, it must contain valid column int/names in X. @@ -957,17 +950,17 @@ def graph_partial_dependence( the partial dependence for each class. This argument does not change behavior for regression or binary classification pipelines. For binary classification, the partial dependence for the positive label will always be displayed. Defaults to None. - grid_resolution (int): Number of samples of feature(s) for partial dependence plot. - kind ({'average', 'individual', 'both'}): Type of partial dependence to plot. 'average' creates a regular partial dependence - (PD) graph, 'individual' creates an individual conditional expectation (ICE) plot, and 'both' creates a - single-figure PD and ICE plot. ICE plots can only be shown for one-way partial dependence plots. + grid_resolution (int): Number of samples of feature(s) for partial dependence plot + kind {'average', 'individual', 'both'}: Type of partial dependence to plot. 'average' creates a regular partial dependence + (PD) graph, 'individual' creates an individual conditional expectation (ICE) plot, and 'both' creates a + single-figure PD and ICE plot. ICE plots can only be shown for one-way partial dependence plots. Returns: - plotly.graph_objects.Figure: Figure object containing the partial dependence data for plotting. + plotly.graph_objects.Figure: figure object containing the partial dependence data for plotting Raises: - PartialDependenceError: If a graph is requested for a class name that isn't present in the pipeline. - PartialDependenceError: If an ICE plot is requested for a two-way partial dependence. + PartialDependenceError: if a graph is requested for a class name that isn't present in the pipeline. + PartialDependenceError: if an ICE plot is requested for a two-way partial dependence. """ X = infer_feature_types(X) if isinstance(features, (list, tuple)): @@ -1186,22 +1179,21 @@ def _calculate_axis_range(arr): def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): - """Combine y_true and y_pred into a single dataframe and adds a column for outliers. Used in `graph_prediction_vs_actual()`. + """Combines y_true and y_pred into a single dataframe and adds a column for outliers. Used in `graph_prediction_vs_actual()`. - Args: - y_true (pd.Series, or np.ndarray): The real target values of the data. + Arguments: + y_true (pd.Series, or np.ndarray): The real target values of the data y_pred (pd.Series, or np.ndarray): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference - between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow. Defaults to None. + between each value of y_true and y_pred. 
Values within this threshold will be blue, otherwise they will be yellow. + Defaults to None Returns: pd.DataFrame with the following columns: - * `prediction`: Predicted values from regression model. - * `actual`: Real target values. - * `outlier`: Colors indicating which values are in the threshold for what is considered an outlier value. + * `prediction`: Predicted values from regression model. + * `actual`: Real target values. + * `outlier`: Colors indicating which values are in the threshold for what is considered an outlier value. - Raises: - ValueError: If threshold is not positive. """ if outlier_threshold and outlier_threshold <= 0: raise ValueError( @@ -1227,19 +1219,18 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): - """Generate a scatter plot comparing the true and predicted values. Used for regression plotting. + """Generate a scatter plot comparing the true and predicted values. Used for regression plotting - Args: + Arguments: y_true (pd.Series): The real target values of the data y_pred (pd.Series): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference - between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow. Defaults to None. + between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow. + Defaults to None Returns: - plotly.Figure representing the predicted vs. actual values graph. + plotly.Figure representing the predicted vs. actual values graph - Raises: - ValueError: If threshold is not positive. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -1312,17 +1303,13 @@ def recurse(i): def decision_tree_data_from_estimator(estimator): - """Return data for a fitted tree in a restructured format. + """Return data for a fitted tree in a restructured format - Args: + Arguments: estimator (ComponentBase): A fitted DecisionTree-based estimator. Returns: - OrderedDict: An OrderedDict of OrderedDicts describing a tree structure. - - Raises: - ValueError: If estimator is not a decision tree-based estimator. - NotFittedError: If estimator is not yet fitted. + OrderedDict: An OrderedDict of OrderedDicts describing a tree structure """ if not estimator.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1339,17 +1326,13 @@ def decision_tree_data_from_estimator(estimator): def decision_tree_data_from_pipeline(pipeline_): - """Return data for a fitted pipeline with in a restructured format. + """Return data for a fitted pipeline with in a restructured format - Args: + Arguments: pipeline_ (PipelineBase): A pipeline with a DecisionTree-based estimator. Returns: - OrderedDict: An OrderedDict of OrderedDicts describing a tree structure. - - Raises: - ValueError: If input pipeline is not a decision tree model. - NotFittedError: If pipeline is not fitted. + OrderedDict: An OrderedDict of OrderedDicts describing a tree structure """ if not pipeline_.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1369,21 +1352,20 @@ def decision_tree_data_from_pipeline(pipeline_): def visualize_decision_tree( estimator, max_depth=None, rotate=False, filled=False, filepath=None ): - """Generate an image visualizing the decision tree. 
+ """Generate an image visualizing the decision tree - Args: + Arguments: estimator (ComponentBase): A fitted DecisionTree-based estimator. - max_depth (int, optional): The depth to which the tree should be displayed. If set to None (as by default), tree is fully generated. + max_depth (int, optional): The depth to which the tree should be displayed. If set to None (as by default), + tree is fully generated. rotate (bool, optional): Orient tree left to right rather than top-down. - filled (bool, optional): Paint nodes to indicate majority class for classification, extremity of values for regression, or purity of node for multi-output. - filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph will not be saved. + filled (bool, optional): Paint nodes to indicate majority class for classification, extremity of values for + regression, or purity of node for multi-output. + filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph + will not be saved. Returns: graphviz.Source: DOT object that can be directly displayed in Jupyter notebooks. - - Raises: - ValueError: If the estimator is not a decision tree estimator. - NotFittedError: If the estimator is not fitted yet. """ if not estimator.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1448,15 +1430,16 @@ def visualize_decision_tree( def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): """Get the data needed for the prediction_vs_actual_over_time plot. - Args: + Arguments: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. dates (pd.Series): Dates corresponding to target values and predictions. Returns: - pd.DataFrame: Predictions vs time. + pd.DataFrame """ + dates = infer_feature_types(dates) y = infer_feature_types(y) prediction = pipeline.predict(X, y) @@ -1473,7 +1456,7 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): """Plot the target values and predictions against time on the x-axis. - Args: + Arguments: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. @@ -1481,9 +1464,6 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): Returns: plotly.Figure: Showing the prediction vs actual over time. - - Raises: - ValueError: If the pipeline is not a time-series regression pipeline. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -1524,18 +1504,14 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): def get_linear_coefficients(estimator, features=None): - """Return a dataframe showing the features with the greatest predictive power for a linear model. + """Returns a dataframe showing the features with the greatest predictive power for a linear model. - Args: + Arguments: estimator (Estimator): Fitted linear model family estimator. features (list[str]): List of feature names associated with the underlying data. Returns: pd.DataFrame: Displaying the features by importance. - - Raises: - ValueError: If the model is not a linear model. - NotFittedError: If the model is not yet fitted. 
""" if not estimator.model_family == ModelFamily.LINEAR_MODEL: raise ValueError( @@ -1566,21 +1542,17 @@ def t_sne( ): """Get the transformed output after fitting X to the embedded space using t-SNE. - Args: + Arguments: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning - algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. + algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad - local minimum, increasing the learning rate may help. + local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. - **kwargs: Additional arbitrary arguments. Returns: np.ndarray (n_samples, n_components) - - Raises: - ValueError: If specified parameters are not valid values. """ if not isinstance(n_components, int) or not n_components > 0: raise ValueError( @@ -1611,25 +1583,22 @@ def graph_t_sne( marker_size=7, **kwargs, ): - """Plot high dimensional data into lower dimensional space using t-SNE. + """Plot high dimensional data into lower dimensional space using t-SNE . - Args: + Arguments: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. - perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning. - algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. + perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning + algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad - local minimum, increasing the learning rate may help. + local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. marker_line_width (int, optional): Determines the line width of the marker boundary. marker_size (int, optional): Determines the size of the marker. - **kwargs: Additional keyword arguments to pass. Returns: - plotly.Figure: Figure representing the transformed data. + plotly.Figure representing the transformed data - Raises: - ValueError: If marker_line_width or marker_size are not valid values. 
""" _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" From 5f48e125340bf7e4079bc0fbb64cbb1369c55754 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 10 Sep 2021 11:32:05 -0400 Subject: [PATCH 39/62] add back a little --- evalml/model_understanding/graphs.py | 79 +++++++++++++++------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 01c45e6340..03f1a4afc5 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1,3 +1,4 @@ +"""Model understanding graphing utilities.""" import copy import os import warnings @@ -42,9 +43,9 @@ def confusion_matrix(y_true, y_predicted, normalize_method="true"): """Confusion matrix for binary and multiclass classification. - Arguments: + Args: y_true (pd.Series or np.ndarray): True binary labels. - y_pred (pd.Series or np.ndarray): Predictions from a binary classifier. + y_predicted (pd.Series or np.ndarray): Predictions from a binary classifier. normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. Returns: @@ -65,12 +66,15 @@ def confusion_matrix(y_true, y_predicted, normalize_method="true"): def normalize_confusion_matrix(conf_mat, normalize_method="true"): """Normalizes a confusion matrix. - Arguments: + Args: conf_mat (pd.DataFrame or np.ndarray): Confusion matrix to normalize. normalize_method ({'true', 'pred', 'all'}): Normalization method. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. Returns: pd.DataFrame: normalized version of the input confusion matrix. The column header represents the predicted labels while row header represents the actual labels. + + Raises: + ValueError: If configuration is invalid, or if the sum of a given axis is zero and normalization by axis is specified. """ conf_mat = infer_feature_types(conf_mat) col_names = conf_mat.columns @@ -104,14 +108,14 @@ def graph_confusion_matrix( If `normalize_method` is set, hover text will show raw count, otherwise hover text will show count normalized with method 'true'. - Arguments: + Args: y_true (pd.Series or np.ndarray): True binary labels. y_pred (pd.Series or np.ndarray): Predictions from a binary classifier. normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'. - title_addition (str or None): if not None, append to plot title. Defaults to None. + title_addition (str): If not None, append to plot title. Defaults to None. Returns: - plotly.Figure representing the confusion matrix plot generated + plotly.Figure representing the confusion matrix plot generated. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -180,10 +184,9 @@ def graph_confusion_matrix( def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): - """ - Given labels and binary classifier predicted probabilities, compute and return the data representing a precision-recall curve. + """Given labels and binary classifier predicted probabilities, compute and return the data representing a precision-recall curve. 
- Arguments: + Args: y_true (pd.Series or np.ndarray): True binary labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label. pos_label_idx (int): the column index corresponding to the positive class. If predicted probabilities are two-dimensional, this will be used to access the probabilities for the positive class. @@ -221,10 +224,10 @@ def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): """Generate and display a precision-recall plot. - Arguments: + Args: y_true (pd.Series or np.ndarray): True binary labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label. - title_addition (str or None): If not None, append to plot title. Default None. + title_addition (str or None): If not None, append to plot title. Defaults to None. Returns: plotly.Figure representing the precision-recall plot generated @@ -258,10 +261,9 @@ def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): def roc_curve(y_true, y_pred_proba): - """ - Given labels and classifier predicted probabilities, compute and return the data representing a Receiver Operating Characteristic (ROC) curve. Works with binary or multiclass problems. + """Given labels and classifier predicted probabilities, compute and return the data representing a Receiver Operating Characteristic (ROC) curve. Works with binary or multiclass problems. - Arguments: + Args: y_true (pd.Series or np.ndarray): True labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. @@ -310,14 +312,17 @@ def roc_curve(y_true, y_pred_proba): def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_addition=None): """Generate and display a Receiver Operating Characteristic (ROC) plot for binary and multiclass classification problems. - Arguments: + Args: y_true (pd.Series or np.ndarray): True labels. y_pred_proba (pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. Note this should a one dimensional array with the predicted probability for the "true" label in the binary case. - custom_class_labels (list or None): If not None, custom labels for classes. Default None. - title_addition (str or None): if not None, append to plot title. Default None. + custom_class_names (list or None): If not None, custom labels for classes. Defaults to None. + title_addition (str or None): if not None, append to plot title. Defaults to None. Returns: plotly.Figure representing the ROC plot generated + + Raises: + ValueError: If the number of custom class names does not match number of classes in the input data. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -369,12 +374,12 @@ def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_additio def graph_permutation_importance(pipeline, X, y, objective, importance_threshold=0): """Generate a bar graph of the pipeline's permutation importance. 
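
# A hedged call sketch for the function above: `pipeline` is assumed to be
# fitted on (X, y), and "log loss binary" is one example objective name.
from evalml.model_understanding.graphs import graph_permutation_importance

fig = graph_permutation_importance(
    pipeline, X, y, objective="log loss binary", importance_threshold=0.05
)
fig.show()  # bar graph of features whose |importance| exceeds the threshold
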
- Arguments: - pipeline (PipelineBase or subclass): Fitted pipeline - X (pd.DataFrame): The input data used to score and compute permutation importance - y (pd.Series): The target data - objective (str, ObjectiveBase): Objective to score on - importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is larger than importance_threshold. Defaults to zero. + Args: + pipeline (PipelineBase or subclass): Fitted pipeline. + X (pd.DataFrame): The input data used to score and compute permutation importance. + y (pd.Series): The target data. + objective (str, ObjectiveBase): Objective to score on. + importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is larger than importance_threshold. Defaults to 0. Returns: plotly.Figure, a bar graph showing features and their respective permutation importance. @@ -429,7 +434,7 @@ def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): """Computes objective score as a function of potential binary classification decision thresholds for a fitted binary classification pipeline. - Arguments: + Args: pipeline (BinaryClassificationPipeline obj): Fitted binary classification pipeline X (pd.DataFrame): The input data used to compute objective score y (pd.Series): The target labels @@ -462,7 +467,7 @@ def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): def graph_binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): """Generates a plot graphing objective score vs. decision thresholds for a fitted binary classification pipeline. - Arguments: + Args: pipeline (PipelineBase or subclass): Fitted pipeline X (pd.DataFrame): The input data used to score and compute scores y (pd.Series): The target labels @@ -571,7 +576,7 @@ def partial_dependence( is calculated with the first feature in the y-axis and second feature in the x-axis. - Arguments: + Args: pipeline (PipelineBase or subclass): Fitted pipeline X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at @@ -938,7 +943,7 @@ def graph_partial_dependence( a two-way partial dependence plot with a contour of feature[0] in the y-axis, feature[1] in the x-axis and the partial dependence in the z-axis. - Arguments: + Args: pipeline (PipelineBase or subclass): Fitted pipeline X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values for feature where partial dependence will be calculated at @@ -1181,7 +1186,7 @@ def _calculate_axis_range(arr): def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): """Combines y_true and y_pred into a single dataframe and adds a column for outliers. Used in `graph_prediction_vs_actual()`. - Arguments: + Args: y_true (pd.Series, or np.ndarray): The real target values of the data y_pred (pd.Series, or np.ndarray): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference @@ -1221,7 +1226,7 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): """Generate a scatter plot comparing the true and predicted values. 
Used for regression plotting - Arguments: + Args: y_true (pd.Series): The real target values of the data y_pred (pd.Series): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference @@ -1305,7 +1310,7 @@ def recurse(i): def decision_tree_data_from_estimator(estimator): """Return data for a fitted tree in a restructured format - Arguments: + Args: estimator (ComponentBase): A fitted DecisionTree-based estimator. Returns: @@ -1328,7 +1333,7 @@ def decision_tree_data_from_estimator(estimator): def decision_tree_data_from_pipeline(pipeline_): """Return data for a fitted pipeline with in a restructured format - Arguments: + Args: pipeline_ (PipelineBase): A pipeline with a DecisionTree-based estimator. Returns: @@ -1354,7 +1359,7 @@ def visualize_decision_tree( ): """Generate an image visualizing the decision tree - Arguments: + Args: estimator (ComponentBase): A fitted DecisionTree-based estimator. max_depth (int, optional): The depth to which the tree should be displayed. If set to None (as by default), tree is fully generated. @@ -1430,7 +1435,7 @@ def visualize_decision_tree( def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): """Get the data needed for the prediction_vs_actual_over_time plot. - Arguments: + Args: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. @@ -1456,7 +1461,7 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): """Plot the target values and predictions against time on the x-axis. - Arguments: + Args: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. @@ -1506,7 +1511,7 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): def get_linear_coefficients(estimator, features=None): """Returns a dataframe showing the features with the greatest predictive power for a linear model. - Arguments: + Args: estimator (Estimator): Fitted linear model family estimator. features (list[str]): List of feature names associated with the underlying data. @@ -1542,7 +1547,7 @@ def t_sne( ): """Get the transformed output after fitting X to the embedded space using t-SNE. - Arguments: + Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning @@ -1585,7 +1590,7 @@ def graph_t_sne( ): """Plot high dimensional data into lower dimensional space using t-SNE . - Arguments: + Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. 
perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning From a5d4b1245ba02a5f5caa8ae9248d4b8997b70e2b Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 10 Sep 2021 12:27:12 -0400 Subject: [PATCH 40/62] add back a little more --- evalml/model_understanding/graphs.py | 60 +++++++++++++++------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 03f1a4afc5..6e980b1a9a 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -198,6 +198,9 @@ def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): * `recall`: Recall values. * `thresholds`: Threshold values used to produce the precision and recall. * `auc_score`: The area under the ROC curve. + + Raises: + NoPositiveLabelException: If predicted probabilities do not contain a column at the specified label. """ y_true = infer_feature_types(y_true) y_pred_proba = infer_feature_types(y_pred_proba) @@ -383,6 +386,9 @@ def graph_permutation_importance(pipeline, X, y, objective, importance_threshold Returns: plotly.Figure, a bar graph showing features and their respective permutation importance. + + Raises: + ValueError: If importance_threshold is not greater than or equal to 0. """ go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -431,19 +437,21 @@ def graph_permutation_importance(pipeline, X, y, objective, importance_threshold def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): - """Computes objective score as a function of potential binary classification - decision thresholds for a fitted binary classification pipeline. + """Computes objective score as a function of potential binary classification decision thresholds for a fitted binary classification pipeline. Args: - pipeline (BinaryClassificationPipeline obj): Fitted binary classification pipeline - X (pd.DataFrame): The input data used to compute objective score - y (pd.Series): The target labels - objective (ObjectiveBase obj, str): Objective used to score - steps (int): Number of intervals to divide and calculate objective score at + pipeline (BinaryClassificationPipeline obj): Fitted binary classification pipeline. + X (pd.DataFrame): The input data used to compute objective score. + y (pd.Series): The target labels. + objective (ObjectiveBase obj, str): Objective used to score. + steps (int): Number of intervals to divide and calculate objective score at. Returns: - pd.DataFrame: DataFrame with thresholds and the corresponding objective score calculated at each threshold + pd.DataFrame: DataFrame with thresholds and the corresponding objective score calculated at each threshold. + Raises: + ValueError: If objective is not a binary classification objective. + ValueError: If objective's `score_needs_proba` is not False. """ objective = get_objective(objective, return_instance=True) if not objective.is_defined_for_problem_type(ProblemTypes.BINARY): @@ -510,8 +518,7 @@ def _is_feature_of_type(feature, X, ltype): def _put_categorical_feature_first(features, first_feature_categorical): - """If the user is doing a two-way partial dependence plot and one of the features is categorical, - we need to make sure the categorical feature is the first element in the tuple that's passed to sklearn. 
+ """If the user is doing a two-way partial dependence plot and one of the features is categorical, we need to make sure the categorical feature is the first element in the tuple that's passed to sklearn. This is because in the two-way grid calculation, sklearn will try to coerce every element of the grid to the type of the first feature in the tuple. If we put the categorical feature first, the grid will be of type 'object' @@ -570,11 +577,11 @@ def _raise_value_error_if_mostly_one_value(df, percentile): def partial_dependence( pipeline, X, features, percentiles=(0.05, 0.95), grid_resolution=100, kind="average" ): - """Calculates one or two-way partial dependence. If a single integer or - string is given for features, one-way partial dependence is calculated. If + """Calculates one or two-way partial dependence. + + If a single integer or string is given for features, one-way partial dependence is calculated. If a tuple of two integers or strings is given, two-way partial dependence - is calculated with the first feature in the y-axis and second feature in the - x-axis. + is calculated with the first feature in the y-axis and second feature in the x-axis. Args: pipeline (PipelineBase or subclass): Fitted pipeline @@ -589,7 +596,7 @@ def partial_dependence( grid_resolution (int): Number of samples of feature(s) for partial dependence plot. If this value is less than the maximum number of categories present in categorical data within X, it will be set to the max number of categories + 1. Defaults to 100. - kind {'average', 'individual', 'both'}: The type of predictions to return. 'individual' will return the predictions for + kind ({'average', 'individual', 'both'}): The type of predictions to return. 'individual' will return the predictions for all of the points in the grid for each sample in X. 'average' will return the predictions for all of the points in the grid but averaged over all of the samples in X. @@ -624,7 +631,6 @@ def partial_dependence( PartialDependenceError: if any of the features are low-variance. Defined as having one value occurring more than the upper percentile passed by the user. By default 95%. """ - try: # Dynamically set the grid resolution to the maximum number of values # in the categorical/datetime variables if there are more categories/datetime values than resolution cells @@ -937,8 +943,9 @@ def _update_fig_with_two_way_partial_dependence( def graph_partial_dependence( pipeline, X, features, class_label=None, grid_resolution=100, kind="average" ): - """Create an one-way or two-way partial dependence plot. Passing a single integer or - string as features will create a one-way partial dependence plot with the feature values + """Create an one-way or two-way partial dependence plot. + + Passing a single integer or string as features will create a one-way partial dependence plot with the feature values plotted against the partial dependence. Passing features a tuple of int/strings will create a two-way partial dependence plot with a contour of feature[0] in the y-axis, feature[1] in the x-axis and the partial dependence in the z-axis. @@ -1191,7 +1198,7 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): y_pred (pd.Series, or np.ndarray): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference between each value of y_true and y_pred. 
Values within this threshold will be blue, otherwise they will be yellow. - Defaults to None + Defaults to None. Returns: pd.DataFrame with the following columns: @@ -1224,14 +1231,14 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): - """Generate a scatter plot comparing the true and predicted values. Used for regression plotting + """Generate a scatter plot comparing the true and predicted values. Used for regression plotting. Args: - y_true (pd.Series): The real target values of the data + y_true (pd.Series): The real target values of the data. y_pred (pd.Series): The predicted values outputted by the regression model. outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow. - Defaults to None + Defaults to None. Returns: plotly.Figure representing the predicted vs. actual values graph @@ -1308,13 +1315,13 @@ def recurse(i): def decision_tree_data_from_estimator(estimator): - """Return data for a fitted tree in a restructured format + """Return data for a fitted tree in a restructured format. Args: estimator (ComponentBase): A fitted DecisionTree-based estimator. Returns: - OrderedDict: An OrderedDict of OrderedDicts describing a tree structure + OrderedDict: An OrderedDict of OrderedDicts describing a tree structure. """ if not estimator.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1331,7 +1338,7 @@ def decision_tree_data_from_estimator(estimator): def decision_tree_data_from_pipeline(pipeline_): - """Return data for a fitted pipeline with in a restructured format + """Return data for a fitted pipeline in a restructured format. Args: pipeline_ (PipelineBase): A pipeline with a DecisionTree-based estimator. @@ -1357,7 +1364,7 @@ def decision_tree_data_from_pipeline(pipeline_): def visualize_decision_tree( estimator, max_depth=None, rotate=False, filled=False, filepath=None ): - """Generate an image visualizing the decision tree + """Generate an image visualizing the decision tree. Args: estimator (ComponentBase): A fitted DecisionTree-based estimator. @@ -1444,7 +1451,6 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): Returns: pd.DataFrame """ - dates = infer_feature_types(dates) y = infer_feature_types(y) prediction = pipeline.predict(X, y) From 1111632fa1a84bce01e7932918c0deea6416b200 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 10 Sep 2021 13:49:26 -0400 Subject: [PATCH 41/62] add back more, ignoring indentation errors likely causing RtD failures --- evalml/model_understanding/graphs.py | 48 +++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 6e980b1a9a..1f30df92ae 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -630,6 +630,8 @@ def partial_dependence( PartialDependenceError: if any of the features passed in are completely NaN PartialDependenceError: if any of the features are low-variance. Defined as having one value occurring more than the upper percentile passed by the user. By default 95%. + ValueError: Error during call to scikit-learn's partial dependence method. + Exception: All other errors during calculation. 
""" try: # Dynamically set the grid resolution to the maximum number of values @@ -951,9 +953,9 @@ def graph_partial_dependence( in the x-axis and the partial dependence in the z-axis. Args: - pipeline (PipelineBase or subclass): Fitted pipeline + pipeline (PipelineBase or subclass): Fitted pipeline. X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values - for feature where partial dependence will be calculated at + for feature where partial dependence will be calculated at. features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. If features is an int, it must be the index of the feature to use. If features is a string, it must be a valid column name in X. @@ -962,8 +964,8 @@ def graph_partial_dependence( the partial dependence for each class. This argument does not change behavior for regression or binary classification pipelines. For binary classification, the partial dependence for the positive label will always be displayed. Defaults to None. - grid_resolution (int): Number of samples of feature(s) for partial dependence plot - kind {'average', 'individual', 'both'}: Type of partial dependence to plot. 'average' creates a regular partial dependence + grid_resolution (int): Number of samples of feature(s) for partial dependence plot. + kind ({'average', 'individual', 'both'}): Type of partial dependence to plot. 'average' creates a regular partial dependence (PD) graph, 'individual' creates an individual conditional expectation (ICE) plot, and 'both' creates a single-figure PD and ICE plot. ICE plots can only be shown for one-way partial dependence plots. @@ -1206,6 +1208,8 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): * `actual`: Real target values. * `outlier`: Colors indicating which values are in the threshold for what is considered an outlier value. + Raises: + ValueError: If threshold is not positive. """ if outlier_threshold and outlier_threshold <= 0: raise ValueError( @@ -1243,6 +1247,8 @@ def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): Returns: plotly.Figure representing the predicted vs. actual values graph + Raises: + ValueError: If threshold is not positive. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -1322,6 +1328,10 @@ def decision_tree_data_from_estimator(estimator): Returns: OrderedDict: An OrderedDict of OrderedDicts describing a tree structure. + + Raises: + ValueError: If estimator is not a decision tree-based estimator. + NotFittedError: If estimator is not yet fitted. """ if not estimator.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1344,7 +1354,11 @@ def decision_tree_data_from_pipeline(pipeline_): pipeline_ (PipelineBase): A pipeline with a DecisionTree-based estimator. Returns: - OrderedDict: An OrderedDict of OrderedDicts describing a tree structure + OrderedDict: An OrderedDict of OrderedDicts describing a tree structure. + + Raises: + ValueError: If estimator is not a decision tree-based estimator. + NotFittedError: If estimator is not yet fitted. """ if not pipeline_.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1378,6 +1392,10 @@ def visualize_decision_tree( Returns: graphviz.Source: DOT object that can be directly displayed in Jupyter notebooks. + + Raises: + ValueError: If estimator is not a decision tree-based estimator. + NotFittedError: If estimator is not yet fitted. 
""" if not estimator.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1449,7 +1467,7 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): dates (pd.Series): Dates corresponding to target values and predictions. Returns: - pd.DataFrame + pd.DataFrame: Predictions vs. time. """ dates = infer_feature_types(dates) y = infer_feature_types(y) @@ -1475,6 +1493,9 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): Returns: plotly.Figure: Showing the prediction vs actual over time. + + Raises: + ValueError: If the pipeline is not a time-series regression pipeline. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -1523,6 +1544,10 @@ def get_linear_coefficients(estimator, features=None): Returns: pd.DataFrame: Displaying the features by importance. + + Raises: + ValueError: If the model is not a linear model. + NotFittedError: If the model is not yet fitted. """ if not estimator.model_family == ModelFamily.LINEAR_MODEL: raise ValueError( @@ -1561,9 +1586,13 @@ def t_sne( learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. + **kwargs: Additional keyword arguments to pass. Returns: - np.ndarray (n_samples, n_components) + np.ndarray (n_samples, n_components). + + Raises: + ValueError: If specified parameters are not valid values. """ if not isinstance(n_components, int) or not n_components > 0: raise ValueError( @@ -1606,10 +1635,13 @@ def graph_t_sne( metric (str, optional): The metric to use when calculating distance between instances in a feature array. marker_line_width (int, optional): Determines the line width of the marker boundary. marker_size (int, optional): Determines the size of the marker. + **kwargs: Additional keyword arguments to pass. Returns: - plotly.Figure representing the transformed data + plotly.Figure: Figure representing the transformed data. + Raises: + ValueError: If marker_line_width or marker_size are not valid values. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" From 970f246c834aa632c61f65df7f5d3f34544c84e8 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 10 Sep 2021 14:37:12 -0400 Subject: [PATCH 42/62] test kwargs --- evalml/model_understanding/graphs.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 15ddd653b7..a4cfab3a42 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -624,14 +624,14 @@ def partial_dependence( feature value pair. Raises: + ValueError: Error during call to scikit-learn's partial dependence method. + Exception: All other errors during calculation. PartialDependenceError: if the user provides a tuple of not exactly two features. PartialDependenceError: if the provided pipeline isn't fitted. PartialDependenceError: if the provided pipeline is a Baseline pipeline. PartialDependenceError: if any of the features passed in are completely NaN PartialDependenceError: if any of the features are low-variance. Defined as having one value occurring more than the upper percentile passed by the user. By default 95%. - ValueError: Error during call to scikit-learn's partial dependence method. 
-        Exception: All other errors during calculation.
     """
     try:
         # Dynamically set the grid resolution to the maximum number of values
@@ -1591,7 +1591,6 @@ def t_sne(
         learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help.
         metric (str, optional): The metric to use when calculating distance between instances in a feature array.
-        **kwargs: Additional keyword arguments to pass.

     Returns:
         np.ndarray (n_samples, n_components).
@@ -1640,7 +1639,7 @@ def graph_t_sne(
         metric (str, optional): The metric to use when calculating distance between instances in a feature array.
         marker_line_width (int, optional): Determines the line width of the marker boundary.
         marker_size (int, optional): Determines the size of the marker.
-        **kwargs: Additional keyword arguments to pass.
+        **kwargs: Additional arbitrary parameters.

     Returns:
         plotly.Figure: Figure representing the transformed data.

From 0c67410094f5f30fabfc279818e1f528d36bb41b Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Fri, 10 Sep 2021 15:01:10 -0400
Subject: [PATCH 43/62] add another kwargs

---
 evalml/model_understanding/graphs.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index a4cfab3a42..34e0df1e63 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -1591,6 +1591,7 @@ def t_sne(
         learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help.
         metric (str, optional): The metric to use when calculating distance between instances in a feature array.
+        **kwargs: Additional arbitrary parameters.

     Returns:
         np.ndarray (n_samples, n_components).

From 10ab2861069d93258ee044fa195992ec9f744fe1 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Fri, 10 Sep 2021 15:35:16 -0400
Subject: [PATCH 44/62] attempt to fix kwargs

---
 evalml/model_understanding/graphs.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index 34e0df1e63..16892d1f9c 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -1586,10 +1586,8 @@ def t_sne(
     Args:
         X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric.
         n_components (int, optional): Dimension of the embedded space.
-        perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning
-            algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50.
-        learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad
-            local minimum, increasing the learning rate may help.
+        perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50.
+        learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help.
         metric (str, optional): The metric to use when calculating distance between instances in a feature array.
         **kwargs: Additional arbitrary parameters.
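(These kwargs patches are easier to follow with a usage sketch; X_num is a hypothetical all-numeric DataFrame, and the forwarding of extra keyword arguments to scikit-learn's TSNE is an assumption, not something the diff states.)

    from evalml.model_understanding.graphs import t_sne

    embedding = t_sne(X_num, n_components=2, perplexity=30.0, learning_rate=200.0)

    # The **kwargs ("additional arbitrary parameters") would carry anything else
    # through, e.g. a fixed seed -- assuming they reach sklearn.manifold.TSNE unchanged.
    embedding_seeded = t_sne(X_num, random_state=42)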
@@ -1628,7 +1626,7 @@ def graph_t_sne(
     marker_size=7,
     **kwargs,
 ):
-    """Plot high dimensional data into lower dimensional space using t-SNE .
+    """Plot high dimensional data into lower dimensional space using t-SNE.

     Args:
         X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric.

From 622342bea1bd64355d92a2a4becf30be9a40aee9 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Fri, 10 Sep 2021 16:21:22 -0400
Subject: [PATCH 45/62] lost attempt again at fixing kwargs

---
 evalml/model_understanding/graphs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index 16892d1f9c..6571cdb30c 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -1592,7 +1592,7 @@ def t_sne(
         **kwargs: Additional arbitrary parameters.

     Returns:
-        np.ndarray (n_samples, n_components).
+        np.ndarray (n_samples, n_components): TSNE output.

     Raises:
         ValueError: If specified parameters are not valid values.

From eb94c8992d7e505f00f90aa13cddaa39f3738625 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Fri, 10 Sep 2021 16:41:28 -0400
Subject: [PATCH 46/62] remove to confirm issue

---
 evalml/model_understanding/graphs.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index 6571cdb30c..bf5bfa3da9 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -1638,7 +1638,6 @@ def graph_t_sne(
         metric (str, optional): The metric to use when calculating distance between instances in a feature array.
         marker_line_width (int, optional): Determines the line width of the marker boundary.
         marker_size (int, optional): Determines the size of the marker.
-        **kwargs: Additional arbitrary parameters.

     Returns:
         plotly.Figure: Figure representing the transformed data.

From 62bfa2695843868f65a69367000c0d34562fa577 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Fri, 10 Sep 2021 17:10:10 -0400
Subject: [PATCH 47/62] revert

---
 docs/source/release_notes.rst           |  26 ++---
 docs/source/user_guide/timeseries.ipynb |  37 +++++-
 evalml/model_understanding/graphs.py    |  70 +++---------
 .../_user_interface.py | 10 +-
 .../latest_dependency_versions.txt | 2 +-
 .../test_explainers.py | 108 ------------------
 .../model_understanding_tests/test_graphs.py | 75 +++++-------
 7 files changed, 101 insertions(+), 227 deletions(-)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 2ad1326f38..7c5b206fc7 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -6,10 +6,6 @@ Release Notes
        * Added label encoder to ``XGBoostClassifier`` to remove the warning :pr:`2701`
        * Set ``eval_metric`` to ``logloss`` for ``XGBoostClassifier`` :pr:`2741`
        * Added support for ``woodwork`` versions ``0.7.0`` and ``0.7.1`` :pr:`2743`
-        * Changed ``explain_predictions`` functions to display original feature values :pr:`2759`
-        * Added ``X_train`` and ``y_train`` to ``graph_prediction_vs_actual_over_time`` and ``get_prediction_vs_actual_over_time_data`` :pr:`2762`
-        * Added ``forecast_horizon`` as a required parameter to time series pipelines and ``AutoMLSearch`` :pr:`2697`
-        * Added ``predict_in_sample`` and ``predict_proba_in_sample`` methods to time series pipelines to predict on data where the target is known, e.g.
cross-validation :pr:`2697`
    * Fixes
        * Fixed bug where ``_catch_warnings`` assumed all warnings were ``PipelineNotUsed`` :pr:`2753`
        * Fixed bug where ``Imputer.transform`` would erase ww typing information prior to handing data to the ``SimpleImputer`` :pr:`2752`
@@ -18,15 +14,10 @@ Release Notes
        * Deleted ``drop_nan_target_rows`` utility method :pr:`2737`
        * Removed default logging setup and debugging log file :pr:`2645`
        * Changed the default n_jobs value for ``XGBoostClassifier`` and ``XGBoostRegressor`` to 12 :pr:`2757`
-        * Changed ``TimeSeriesBaselineEstimator`` to only work on a time series pipeline with a ``DelayedFeaturesTransformer`` :pr:`2697`
-        * Added ``X_train`` and ``y_train`` as optional parameters to pipeline ``predict``, ``predict_proba``. Only used for time series pipelines :pr:`2697`
-        * Added ``training_data`` and ``training_target`` as optional parameters to ``explain_predictions`` and ``explain_predictions_best_worst`` to support time series pipelines :pr:`2697`
-        * Changed time series pipeline predictions to no longer output series/dataframes padded with NaNs. A prediction will be returned for every row in the `X` input :pr:`2697`
    * Documentation Changes
        * Specified installation steps for Prophet :pr:`2713`
        * Added documentation for data exploration on data check actions :pr:`2696`
        * Added docstring linting package ``pydocstyle`` and rule to `make-lint` command :pr:`2670`
-        * Added a user guide entry for time series modelling :pr:`2697`
    * Testing Changes
        * Fixed flaky ``TargetDistributionDataCheck`` test for very_lognormal distribution :pr:`2748`
@@ -34,11 +25,6 @@ Release Notes

    **Breaking Changes**
        * Removed default logging setup and debugging log file :pr:`2645`
-        * Added ``X_train`` and ``y_train`` to ``graph_prediction_vs_actual_over_time`` and ``get_prediction_vs_actual_over_time_data`` :pr:`2762`
-        * Added ``forecast_horizon`` as a required parameter to time series pipelines and ``AutoMLSearch`` :pr:`2697`
-        * Changed ``TimeSeriesBaselineEstimator`` to only work on a time series pipeline with a ``DelayedFeaturesTransformer`` :pr:`2697`
-        * Added ``X_train`` and ``y_train`` as required parameters for ``predict`` and ``predict_proba`` in time series pipelines :pr:`2697`
-        * Added ``training_data`` and ``training_target`` as required parameters to ``explain_predictions`` and ``explain_predictions_best_worst`` for time series pipelines :pr:`2697`

**v0.32.0 Aug. 31, 2021**
    * Enhancements
@@ -51,6 +37,8 @@ Release Notes
        * Updated pipeline ``graph()`` to distinguish X and y edges :pr:`2654`
        * Added ``DropRowsTransformer`` component :pr:`2692`
        * Added ``DROP_ROWS`` to ``_make_component_list_from_actions`` and clean up metadata :pr:`2694`
+        * Added ``forecast_horizon`` as a required parameter to time series pipelines and ``AutoMLSearch`` :pr:`2697`
+        * Added ``predict_in_sample`` and ``predict_proba_in_sample`` methods to time series pipelines to predict on data where the target is known, e.g.
cross-validation :pr:`2697` * Fixes * Updated Oversampler logic to select best SMOTE based on component input instead of pipeline input :pr:`2695` * Added ability to explicitly close DaskEngine resources to improve runtime and reduce Dask warnings :pr:`2667` @@ -59,9 +47,14 @@ Release Notes * Changes * Replaced ``SMOTEOversampler``, ``SMOTENOversampler`` and ``SMOTENCOversampler`` with consolidated ``Oversampler`` component :pr:`2695` * Removed ``LinearRegressor`` from the list of default ``AutoMLSearch`` estimators due to poor performance :pr:`2660` + * Changed ``TimeSeriesBaselineEstimator`` to only work on a time series pipeline with a ``DelayedFeaturesTransformer`` :pr:`2697` + * Added ``X_train`` and ``y_train`` as optional parameters to pipeline ``predict``, ``predict_proba``. Only used for time series pipelines :pr:`2697` + * Added ``training_data`` and ``training_target`` as optional parameters to ``explain_predictions`` and ``explain_predictions_best_worst`` to support time series pipelines :pr:`2697` + * Changed time series pipeline predictions to no longer output series/dataframes padded with NaNs. A prediction will be returned for every row in the `X` input :pr:`2697` * Documentation Changes * Added user guide documentation for using ``ComponentGraph`` and added ``ComponentGraph`` to API reference :pr:`2673` * Updated documentation to make parallelization of AutoML clearer :pr:`2667` + * Added a user guide entry for time series modelling :pr:`2697` * Testing Changes * Removes the process-level parallelism from the ``test_cancel_job`` test :pr:`2666` * Installed numba 0.53 in windows CI to prevent problems installing version 0.54 :pr:`2710` @@ -72,6 +65,11 @@ Release Notes * Renamed the current top level ``search`` method to ``search_iterative`` and defined a new ``search`` method for the ``DefaultAlgorithm`` :pr:`2634` * Replaced ``SMOTEOversampler``, ``SMOTENOversampler`` and ``SMOTENCOversampler`` with consolidated ``Oversampler`` component :pr:`2695` * Removed ``LinearRegressor`` from the list of default ``AutoMLSearch`` estimators due to poor performance :pr:`2660` + * Added ``forecast_horizon`` as a required parameter to time series pipelines and ``AutoMLSearch`` :pr:`2697` + * Changed ``TimeSeriesBaselineEstimator`` to only work on a time series pipeline with a ``DelayedFeaturesTransformer`` :pr:`2697` + * Added ``X_train`` and ``y_train`` as required parameters for ``predict`` and ``predict_proba`` in time series pipelines :pr:`2697` + * Added ``training_data`` and ``training_target`` as required parameters to ``explain_predictions`` and ``explain_predictions_best_worst`` for time series pipelines :pr:`2697` + **v0.31.0 Aug. 
19, 2021** * Enhancements diff --git a/docs/source/user_guide/timeseries.ipynb b/docs/source/user_guide/timeseries.ipynb index 3b2364e858..d795552e7d 100644 --- a/docs/source/user_guide/timeseries.ipynb +++ b/docs/source/user_guide/timeseries.ipynb @@ -274,10 +274,41 @@ "metadata": {}, "outputs": [], "source": [ - "from evalml.model_understanding import graph_prediction_vs_actual_over_time\n", + "data = pd.DataFrame({\"dates\": X_test.Date,\n", + " \"prediction\": pl.predict_in_sample(X_test, y_test, X_train, y_train),\n", + " \"target\": y_test})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = [\n", + " go.Scatter(\n", + " x=data[\"dates\"],\n", + " y=data[\"target\"],\n", + " mode=\"lines+markers\",\n", + " name=\"Target\",\n", + " line=dict(color=\"#1f77b4\"),\n", + " ),\n", + " go.Scatter(\n", + " x=data[\"dates\"],\n", + " y=data[\"prediction\"],\n", + " mode=\"lines+markers\",\n", + " name=\"Prediction\",\n", + " line=dict(color=\"#d62728\"),\n", + " ),\n", + "]\n", + "# Let plotly pick the best date format.\n", + "layout = go.Layout(\n", + " title={\"text\": \"Prediction vs Target over time\"},\n", + " xaxis={\"title\": \"Time\"},\n", + " yaxis={\"title\": \"Target Values and Predictions\"},\n", + ")\n", "\n", - "fig = graph_prediction_vs_actual_over_time(pl, X_test, y_test, X_train, y_train, dates=X_test['Date'])\n", - "fig" + "go.Figure(data=data, layout=layout)" ] }, { diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index bf5bfa3da9..6e980b1a9a 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -624,8 +624,6 @@ def partial_dependence( feature value pair. Raises: - ValueError: Error during call to scikit-learn's partial dependence method. - Exception: All other errors during calculation. PartialDependenceError: if the user provides a tuple of not exactly two features. PartialDependenceError: if the provided pipeline isn't fitted. PartialDependenceError: if the provided pipeline is a Baseline pipeline. @@ -953,9 +951,9 @@ def graph_partial_dependence( in the x-axis and the partial dependence in the z-axis. Args: - pipeline (PipelineBase or subclass): Fitted pipeline. + pipeline (PipelineBase or subclass): Fitted pipeline X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values - for feature where partial dependence will be calculated at. + for feature where partial dependence will be calculated at features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. If features is an int, it must be the index of the feature to use. If features is a string, it must be a valid column name in X. @@ -964,8 +962,8 @@ def graph_partial_dependence( the partial dependence for each class. This argument does not change behavior for regression or binary classification pipelines. For binary classification, the partial dependence for the positive label will always be displayed. Defaults to None. - grid_resolution (int): Number of samples of feature(s) for partial dependence plot. - kind ({'average', 'individual', 'both'}): Type of partial dependence to plot. 'average' creates a regular partial dependence + grid_resolution (int): Number of samples of feature(s) for partial dependence plot + kind {'average', 'individual', 'both'}: Type of partial dependence to plot. 
'average' creates a regular partial dependence (PD) graph, 'individual' creates an individual conditional expectation (ICE) plot, and 'both' creates a single-figure PD and ICE plot. ICE plots can only be shown for one-way partial dependence plots. @@ -1208,8 +1206,6 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): * `actual`: Real target values. * `outlier`: Colors indicating which values are in the threshold for what is considered an outlier value. - Raises: - ValueError: If threshold is not positive. """ if outlier_threshold and outlier_threshold <= 0: raise ValueError( @@ -1247,8 +1243,6 @@ def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): Returns: plotly.Figure representing the predicted vs. actual values graph - Raises: - ValueError: If threshold is not positive. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -1328,10 +1322,6 @@ def decision_tree_data_from_estimator(estimator): Returns: OrderedDict: An OrderedDict of OrderedDicts describing a tree structure. - - Raises: - ValueError: If estimator is not a decision tree-based estimator. - NotFittedError: If estimator is not yet fitted. """ if not estimator.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1354,11 +1344,7 @@ def decision_tree_data_from_pipeline(pipeline_): pipeline_ (PipelineBase): A pipeline with a DecisionTree-based estimator. Returns: - OrderedDict: An OrderedDict of OrderedDicts describing a tree structure. - - Raises: - ValueError: If estimator is not a decision tree-based estimator. - NotFittedError: If estimator is not yet fitted. + OrderedDict: An OrderedDict of OrderedDicts describing a tree structure """ if not pipeline_.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1392,10 +1378,6 @@ def visualize_decision_tree( Returns: graphviz.Source: DOT object that can be directly displayed in Jupyter notebooks. - - Raises: - ValueError: If estimator is not a decision tree-based estimator. - NotFittedError: If estimator is not yet fitted. """ if not estimator.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1457,22 +1439,21 @@ def visualize_decision_tree( return source_obj -def get_prediction_vs_actual_over_time_data(pipeline, X, y, X_train, y_train, dates): +def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): """Get the data needed for the prediction_vs_actual_over_time plot. Args: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. - X_train (pd.DataFrame): Data the pipeline was trained on. - y_train (pd.Series): Target values for training data. dates (pd.Series): Dates corresponding to target values and predictions. Returns: - pd.DataFrame: Predictions vs. time. + pd.DataFrame """ dates = infer_feature_types(dates) - prediction = pipeline.predict_in_sample(X, y, X_train=X_train, y_train=y_train) + y = infer_feature_types(y) + prediction = pipeline.predict(X, y) return pd.DataFrame( { @@ -1483,22 +1464,17 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, X_train, y_train, da ) -def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates): +def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): """Plot the target values and predictions against time on the x-axis. Args: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. 
X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. - X_train (pd.DataFrame): Data the pipeline was trained on. - y_train (pd.Series): Target values for training data. dates (pd.Series): Dates corresponding to target values and predictions. Returns: plotly.Figure: Showing the prediction vs actual over time. - - Raises: - ValueError: If the pipeline is not a time-series regression pipeline. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -1510,9 +1486,7 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates f"Received {str(pipeline.problem_type)}." ) - data = get_prediction_vs_actual_over_time_data( - pipeline, X, y, X_train, y_train, dates - ) + data = get_prediction_vs_actual_over_time_data(pipeline, X, y, dates) data = [ _go.Scatter( @@ -1549,10 +1523,6 @@ def get_linear_coefficients(estimator, features=None): Returns: pd.DataFrame: Displaying the features by importance. - - Raises: - ValueError: If the model is not a linear model. - NotFittedError: If the model is not yet fitted. """ if not estimator.model_family == ModelFamily.LINEAR_MODEL: raise ValueError( @@ -1586,16 +1556,14 @@ def t_sne( Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. - perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. - learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. + perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning + algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. + learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad + local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. - **kwargs: Additional abritrary parameters. Returns: - np.ndarray (n_samples, n_components): TSNE output. - - Raises: - ValueError: If specified parameters are not valid values. + np.ndarray (n_samples, n_components) """ if not isinstance(n_components, int) or not n_components > 0: raise ValueError( @@ -1626,7 +1594,7 @@ def graph_t_sne( marker_size=7, **kwargs, ): - """Plot high dimensional data into lower dimensional space using t-SNE. + """Plot high dimensional data into lower dimensional space using t-SNE . Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. @@ -1640,10 +1608,8 @@ def graph_t_sne( marker_size (int, optional): Determines the size of the marker. Returns: - plotly.Figure: Figure representing the transformed data. + plotly.Figure representing the transformed data - Raises: - ValueError: If marker_line_width or marker_size are not valid values. 
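(For completeness, a hedged sketch of the graph_t_sne call these hunks keep reworking; X_num is again a hypothetical numeric DataFrame, and the marker arguments follow the docstring above.)

    from evalml.model_understanding.graphs import graph_t_sne

    fig = graph_t_sne(
        X_num, n_components=2, perplexity=30.0, marker_line_width=2, marker_size=7
    )
    fig.show()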
""" _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" diff --git a/evalml/model_understanding/prediction_explanations/_user_interface.py b/evalml/model_understanding/prediction_explanations/_user_interface.py index 2ae64db1be..63a65fe08b 100644 --- a/evalml/model_understanding/prediction_explanations/_user_interface.py +++ b/evalml/model_understanding/prediction_explanations/_user_interface.py @@ -53,12 +53,12 @@ def _make_rows( display_text = symbol * min(int(abs(value) // 0.2) + 1, 5) # At this point, the feature is either in the original data or the data - # the final estimator sees, or both. We use the original feature value if possible - is_original_feature = feature_name in original_features.columns - if is_original_feature: - feature_value = original_features[feature_name].iloc[0] - else: + # the final estimator sees. So if it is not a pipeline feature, it is + # an original feature + if feature_name in pipeline_features.columns: feature_value = pipeline_features[feature_name].iloc[0] + else: + feature_value = original_features[feature_name].iloc[0] if convert_numeric_to_string: if pd.api.types.is_number(feature_value) and not pd.api.types.is_bool( diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt index 6a91ae6ff7..e113b73321 100644 --- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt +++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt @@ -1,6 +1,6 @@ catboost==0.26.1 click==8.0.1 -cloudpickle==2.0.0 +cloudpickle==1.6.0 colorama==0.4.4 dask==2021.9.0 featuretools==0.27.1 diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py index 149e64337d..373b1a220e 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py @@ -1782,111 +1782,3 @@ def test_explain_predictions_url_email(df_with_url_and_email): .isnull() .any() ) - - -@pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) -def test_explain_predictions_report_shows_original_value_if_possible( - pipeline_class, estimator, fraud_100 -): - X, y = fraud_100 - X.ww.set_types({"country": "NaturalLanguage"}) - component_graph = [ - "Imputer", - "DateTime Featurization Component", - "Text Featurization Component", - "One Hot Encoder", - "Standard Scaler", - estimator, - ] - parameters = { - estimator: {"n_jobs": 1}, - } - pipeline = pipeline_class(component_graph=component_graph, parameters=parameters) - - y = transform_y_for_problem_type(pipeline.problem_type, y) - - pipeline.fit(X, y) - - report = explain_predictions( - pipeline, X, y, indices_to_explain=[0], output_format="dict", top_k_features=20 - ) - expected_feature_values = set(X.ww.iloc[0, :].tolist()) - for explanation in report["explanations"][0]["explanations"]: - assert set(explanation["feature_names"]) == set(X.columns) - assert set(explanation["feature_values"]) == expected_feature_values - - X_null = X.ww.copy() - X_null.loc[0, "lat"] = None - X_null.ww.init(schema=X.ww.schema) - - report = explain_predictions( - pipeline, - X_null, - y, - indices_to_explain=[0], - output_format="dict", - top_k_features=20, - ) - for explanation in report["explanations"][0]["explanations"]: - assert 
set(explanation["feature_names"]) == set(X.columns) - for feature_name, feature_value in zip( - explanation["feature_names"], explanation["feature_values"] - ): - if feature_name == "lat": - assert np.isnan(feature_value) - - -def test_explain_predictions_best_worst_report_shows_original_value_if_possible( - fraud_100, -): - X, y = fraud_100 - X.ww.set_types({"country": "NaturalLanguage"}) - component_graph = [ - "Imputer", - "DateTime Featurization Component", - "Text Featurization Component", - "One Hot Encoder", - "Standard Scaler", - "Random Forest Classifier", - ] - parameters = { - "Random Forest Classifier": {"n_jobs": 1}, - } - pipeline = BinaryClassificationPipeline( - component_graph=component_graph, parameters=parameters - ) - - y = transform_y_for_problem_type(pipeline.problem_type, y) - - pipeline.fit(X, y) - report = explain_predictions_best_worst( - pipeline, X, y, num_to_explain=1, output_format="dict", top_k_features=20 - ) - - for index, explanation in enumerate(report["explanations"]): - for exp in explanation["explanations"]: - assert set(exp["feature_names"]) == set(X.columns) - assert set(exp["feature_values"]) == set( - X.ww.iloc[explanation["predicted_values"]["index_id"], :] - ) - - X_null = X.ww.copy() - X_null.loc[0:2, "lat"] = None - X_null.ww.init(schema=X.ww.schema) - - report = explain_predictions_best_worst( - pipeline, - X_null.ww.iloc[:2], - y.ww.iloc[:2], - num_to_explain=1, - output_format="dict", - top_k_features=20, - ) - for explanation in report["explanations"]: - for exp in explanation["explanations"]: - assert set(exp["feature_names"]) == set(X.columns) - for feature_name, feature_value in zip( - exp["feature_names"], exp["feature_values"] - ): - if feature_name == "lat": - assert np.isnan(feature_value) diff --git a/evalml/tests/model_understanding_tests/test_graphs.py b/evalml/tests/model_understanding_tests/test_graphs.py index fb4071ca20..78029c0c2e 100644 --- a/evalml/tests/model_understanding_tests/test_graphs.py +++ b/evalml/tests/model_understanding_tests/test_graphs.py @@ -43,7 +43,6 @@ LinearRegressor, MulticlassClassificationPipeline, RegressionPipeline, - TimeSeriesRegressionPipeline, ) from evalml.problem_types import ProblemTypes from evalml.utils import get_random_state, infer_feature_types @@ -995,56 +994,46 @@ def test_graph_prediction_vs_actual(data_type): assert fig_dict["data"][2]["name"] == ">= outlier_threshold" -def test_get_prediction_vs_actual_over_time_data(ts_data): - X, y = ts_data - X_train, y_train = X.iloc[:15], y.iloc[:15] - X_test, y_test = X.iloc[15:], y.iloc[15:] - - pipeline = TimeSeriesRegressionPipeline( - ["Elastic Net Regressor"], - parameters={ - "pipeline": { - "gap": 0, - "max_delay": 2, - "forecast_horizon": 1, - "date_index": None, - } - }, +@patch("evalml.pipelines.ClassificationPipeline.predict") +@pytest.mark.parametrize("data_type", ["pd", "ww"]) +def test_get_prediction_vs_actual_over_time_data( + mock_predict, data_type, logistic_regression_binary_pipeline_class, make_data_type +): + mock_predict.return_value = pd.Series([0] * 20) + X = make_data_type(data_type, pd.DataFrame()) + y = make_data_type(data_type, pd.Series([0] * 20)) + dates = make_data_type( + data_type, pd.Series(pd.date_range("2000-05-19", periods=20, freq="D")) ) - pipeline.fit(X_train, y_train) - results = get_prediction_vs_actual_over_time_data( - pipeline, X_test, y_test, X_train, y_train, pd.Series(X_test.index) - ) + pipeline = logistic_regression_binary_pipeline_class(parameters={}) + results = 
get_prediction_vs_actual_over_time_data(pipeline, X, y, dates) assert isinstance(results, pd.DataFrame) assert list(results.columns) == ["dates", "target", "prediction"] -def test_graph_prediction_vs_actual_over_time(ts_data): +def test_graph_prediction_vs_actual_over_time(): go = pytest.importorskip( "plotly.graph_objects", reason="Skipping plotting test because plotly not installed", ) - X, y = ts_data - X_train, y_train = X.iloc[:15], y.iloc[:15] - X_test, y_test = X.iloc[15:], y.iloc[15:] + class MockPipeline: + problem_type = ProblemTypes.TIME_SERIES_REGRESSION - pipeline = TimeSeriesRegressionPipeline( - ["Elastic Net Regressor"], - parameters={ - "pipeline": { - "gap": 0, - "max_delay": 2, - "forecast_horizon": 1, - "date_index": None, - } - }, - ) - pipeline.fit(X_train, y_train) + def predict(self, X, y): + y = infer_feature_types(y) + preds = y + 10 + preds.index = range(100, 161) + return preds + y = pd.Series(np.arange(61), index=range(200, 261)) + dates = pd.Series(pd.date_range("2020-03-01", "2020-04-30")) + pipeline = MockPipeline() + + # For this test it doesn't matter what the features are fig = graph_prediction_vs_actual_over_time( - pipeline, X_test, y_test, X_train, y_train, pd.Series(X_test.index) + pipeline, X=pd.DataFrame(), y=y, dates=dates ) assert isinstance(fig, go.Figure) @@ -1056,12 +1045,12 @@ def test_graph_prediction_vs_actual_over_time(ts_data): ) assert len(fig_dict["data"]) == 2 assert fig_dict["data"][0]["line"]["color"] == "#1f77b4" - assert len(fig_dict["data"][0]["x"]) == X_test.shape[0] + assert len(fig_dict["data"][0]["x"]) == 61 assert not np.isnan(fig_dict["data"][0]["y"]).all() - assert len(fig_dict["data"][0]["y"]) == X_test.shape[0] + assert len(fig_dict["data"][0]["y"]) == 61 assert fig_dict["data"][1]["line"]["color"] == "#d62728" - assert len(fig_dict["data"][1]["x"]) == X_test.shape[0] - assert len(fig_dict["data"][1]["y"]) == X_test.shape[0] + assert len(fig_dict["data"][1]["x"]) == 61 + assert len(fig_dict["data"][1]["y"]) == 61 assert not np.isnan(fig_dict["data"][1]["y"]).all() @@ -1076,9 +1065,7 @@ class NotTSPipeline: error_msg = "graph_prediction_vs_actual_over_time only supports time series regression pipelines! Received regression." with pytest.raises(ValueError, match=error_msg): - graph_prediction_vs_actual_over_time( - NotTSPipeline(), None, None, None, None, None - ) + graph_prediction_vs_actual_over_time(NotTSPipeline(), None, None, None) def test_decision_tree_data_from_estimator_not_fitted(tree_estimators): From 067595d4334e90458d3e5cd338c29b56fe87b11e Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 10 Sep 2021 23:08:08 -0400 Subject: [PATCH 48/62] revert --- docs/source/release_notes.rst | 37 +++--- docs/source/user_guide/timeseries.ipynb | 37 +----- evalml/__init__.py | 2 +- evalml/model_understanding/graphs.py | 70 +++++++++--- .../_user_interface.py | 10 +- .../latest_dependency_versions.txt | 2 +- .../test_explainers.py | 108 ++++++++++++++++++ .../model_understanding_tests/test_graphs.py | 75 +++++++----- setup.py | 2 +- 9 files changed, 229 insertions(+), 114 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index b36969d5ba..2ad1326f38 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -1,22 +1,15 @@ Release Notes ------------- **Future Releases** - * Enhancements - * Fixes - * Changes - * Testing Changes - -.. warning:: - - **Breaking Changes** - - -**v0.32.1 Sep. 
10, 2021** * Enhancements * Added ``verbose`` flag to ``AutoMLSearch`` to run search in silent mode by default :pr:`2645` * Added label encoder to ``XGBoostClassifier`` to remove the warning :pr:`2701` * Set ``eval_metric`` to ``logloss`` for ``XGBoostClassifier`` :pr:`2741` * Added support for ``woodwork`` versions ``0.7.0`` and ``0.7.1`` :pr:`2743` + * Changed ``explain_predictions`` functions to display original feature values :pr:`2759` + * Added ``X_train`` and ``y_train`` to ``graph_prediction_vs_actual_over_time`` and ``get_prediction_vs_actual_over_time_data`` :pr:`2762` + * Added ``forecast_horizon`` as a required parameter to time series pipelines and ``AutoMLSearch`` :pr:`2697` + * Added ``predict_in_sample`` and ``predict_proba_in_sample`` methods to time series pipelines to predict on data where the target is known, e.g. cross-validation :pr:`2697` * Fixes * Fixed bug where ``_catch_warnings`` assumed all warnings were ``PipelineNotUsed`` :pr:`2753` * Fixed bug where ``Imputer.transform`` would erase ww typing information prior to handing data to the ``SimpleImputer`` :pr:`2752` @@ -25,10 +18,15 @@ Release Notes * Deleted ``drop_nan_target_rows`` utility method :pr:`2737` * Removed default logging setup and debugging log file :pr:`2645` * Changed the default n_jobs value for ``XGBoostClassifier`` and ``XGBoostRegressor`` to 12 :pr:`2757` + * Changed ``TimeSeriesBaselineEstimator`` to only work on a time series pipeline with a ``DelayedFeaturesTransformer`` :pr:`2697` + * Added ``X_train`` and ``y_train`` as optional parameters to pipeline ``predict``, ``predict_proba``. Only used for time series pipelines :pr:`2697` + * Added ``training_data`` and ``training_target`` as optional parameters to ``explain_predictions`` and ``explain_predictions_best_worst`` to support time series pipelines :pr:`2697` + * Changed time series pipeline predictions to no longer output series/dataframes padded with NaNs. A prediction will be returned for every row in the `X` input :pr:`2697` * Documentation Changes * Specified installation steps for Prophet :pr:`2713` * Added documentation for data exploration on data check actions :pr:`2696` * Added docstring linting package ``pydocstyle`` and rule to `make-lint` command :pr:`2670` + * Added a user guide entry for time series modelling :pr:`2697` * Testing Changes * Fixed flaky ``TargetDistributionDataCheck`` test for very_lognormal distribution :pr:`2748` @@ -36,6 +34,11 @@ Release Notes **Breaking Changes** * Removed default logging setup and debugging log file :pr:`2645` + * Added ``X_train`` and ``y_train`` to ``graph_prediction_vs_actual_over_time`` and ``get_prediction_vs_actual_over_time_data`` :pr:`2762` + * Added ``forecast_horizon`` as a required parameter to time series pipelines and ``AutoMLSearch`` :pr:`2697` + * Changed ``TimeSeriesBaselineEstimator`` to only work on a time series pipeline with a ``DelayedFeaturesTransformer`` :pr:`2697` + * Added ``X_train`` and ``y_train`` as required parameters for ``predict`` and ``predict_proba`` in time series pipelines :pr:`2697` + * Added ``training_data`` and ``training_target`` as required parameters to ``explain_predictions`` and ``explain_predictions_best_worst`` for time series pipelines :pr:`2697` **v0.32.0 Aug. 
31, 2021**
    * Enhancements
@@ -48,8 +51,6 @@ Release Notes
        * Updated pipeline ``graph()`` to distinguish X and y edges :pr:`2654`
        * Added ``DropRowsTransformer`` component :pr:`2692`
        * Added ``DROP_ROWS`` to ``_make_component_list_from_actions`` and clean up metadata :pr:`2694`
-        * Added ``forecast_horizon`` as a required parameter to time series pipelines and ``AutoMLSearch`` :pr:`2697`
-        * Added ``predict_in_sample`` and ``predict_proba_in_sample`` methods to time series pipelines to predict on data where the target is known, e.g. cross-validation :pr:`2697`
    * Fixes
        * Updated Oversampler logic to select best SMOTE based on component input instead of pipeline input :pr:`2695`
        * Added ability to explicitly close DaskEngine resources to improve runtime and reduce Dask warnings :pr:`2667`
@@ -59,14 +59,9 @@ Release Notes
    * Changes
        * Replaced ``SMOTEOversampler``, ``SMOTENOversampler`` and ``SMOTENCOversampler`` with consolidated ``Oversampler`` component :pr:`2695`
        * Removed ``LinearRegressor`` from the list of default ``AutoMLSearch`` estimators due to poor performance :pr:`2660`
-        * Changed ``TimeSeriesBaselineEstimator`` to only work on a time series pipeline with a ``DelayedFeaturesTransformer`` :pr:`2697`
-        * Added ``X_train`` and ``y_train`` as optional parameters to pipeline ``predict``, ``predict_proba``. Only used for time series pipelines :pr:`2697`
-        * Added ``training_data`` and ``training_target`` as optional parameters to ``explain_predictions`` and ``explain_predictions_best_worst`` to support time series pipelines :pr:`2697`
-        * Changed time series pipeline predictions to no longer output series/dataframes padded with NaNs. A prediction will be returned for every row in the `X` input :pr:`2697`
    * Documentation Changes
        * Added user guide documentation for using ``ComponentGraph`` and added ``ComponentGraph`` to API reference :pr:`2673`
        * Updated documentation to make parallelization of AutoML clearer :pr:`2667`
-        * Added a user guide entry for time series modelling :pr:`2697`
    * Testing Changes
        * Removes the process-level parallelism from the ``test_cancel_job`` test :pr:`2666`
        * Installed numba 0.53 in windows CI to prevent problems installing version 0.54 :pr:`2710`
@@ -76,11 +72,6 @@ Release Notes
        * Renamed the current top level ``search`` method to ``search_iterative`` and defined a new ``search`` method for the ``DefaultAlgorithm`` :pr:`2634`
        * Replaced ``SMOTEOversampler``, ``SMOTENOversampler`` and ``SMOTENCOversampler`` with consolidated ``Oversampler`` component :pr:`2695`
        * Removed ``LinearRegressor`` from the list of default ``AutoMLSearch`` estimators due to poor performance :pr:`2660`
-        * Added ``forecast_horizon`` as a required parameter to time series pipelines and ``AutoMLSearch`` :pr:`2697`
-        * Changed ``TimeSeriesBaselineEstimator`` to only work on a time series pipeline with a ``DelayedFeaturesTransformer`` :pr:`2697`
-        * Added ``X_train`` and ``y_train`` as required parameters for ``predict`` and ``predict_proba`` in time series pipelines :pr:`2697`
-        * Added ``training_data`` and ``training_target`` as required parameters to ``explain_predictions`` and ``explain_predictions_best_worst`` for time series pipelines :pr:`2697`
-
**v0.31.0 Aug.
19, 2021** * Enhancements diff --git a/docs/source/user_guide/timeseries.ipynb b/docs/source/user_guide/timeseries.ipynb index d795552e7d..3b2364e858 100644 --- a/docs/source/user_guide/timeseries.ipynb +++ b/docs/source/user_guide/timeseries.ipynb @@ -274,41 +274,10 @@ "metadata": {}, "outputs": [], "source": [ - "data = pd.DataFrame({\"dates\": X_test.Date,\n", - " \"prediction\": pl.predict_in_sample(X_test, y_test, X_train, y_train),\n", - " \"target\": y_test})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = [\n", - " go.Scatter(\n", - " x=data[\"dates\"],\n", - " y=data[\"target\"],\n", - " mode=\"lines+markers\",\n", - " name=\"Target\",\n", - " line=dict(color=\"#1f77b4\"),\n", - " ),\n", - " go.Scatter(\n", - " x=data[\"dates\"],\n", - " y=data[\"prediction\"],\n", - " mode=\"lines+markers\",\n", - " name=\"Prediction\",\n", - " line=dict(color=\"#d62728\"),\n", - " ),\n", - "]\n", - "# Let plotly pick the best date format.\n", - "layout = go.Layout(\n", - " title={\"text\": \"Prediction vs Target over time\"},\n", - " xaxis={\"title\": \"Time\"},\n", - " yaxis={\"title\": \"Target Values and Predictions\"},\n", - ")\n", + "from evalml.model_understanding import graph_prediction_vs_actual_over_time\n", "\n", - "go.Figure(data=data, layout=layout)" + "fig = graph_prediction_vs_actual_over_time(pl, X_test, y_test, X_train, y_train, dates=X_test['Date'])\n", + "fig" ] }, { diff --git a/evalml/__init__.py b/evalml/__init__.py index 7c99ae80ed..3b2de0974e 100644 --- a/evalml/__init__.py +++ b/evalml/__init__.py @@ -22,4 +22,4 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=DeprecationWarning) -__version__ = "0.32.1" +__version__ = "0.32.0" diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 6e980b1a9a..bf5bfa3da9 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -624,6 +624,8 @@ def partial_dependence( feature value pair. Raises: + ValueError: Error during call to scikit-learn's partial dependence method. + Exception: All other errors during calculation. PartialDependenceError: if the user provides a tuple of not exactly two features. PartialDependenceError: if the provided pipeline isn't fitted. PartialDependenceError: if the provided pipeline is a Baseline pipeline. @@ -951,9 +953,9 @@ def graph_partial_dependence( in the x-axis and the partial dependence in the z-axis. Args: - pipeline (PipelineBase or subclass): Fitted pipeline + pipeline (PipelineBase or subclass): Fitted pipeline. X (pd.DataFrame, np.ndarray): The input data used to generate a grid of values - for feature where partial dependence will be calculated at + for feature where partial dependence will be calculated at. features (int, string, tuple[int or string]): The target feature for which to create the partial dependence plot for. If features is an int, it must be the index of the feature to use. If features is a string, it must be a valid column name in X. @@ -962,8 +964,8 @@ def graph_partial_dependence( the partial dependence for each class. This argument does not change behavior for regression or binary classification pipelines. For binary classification, the partial dependence for the positive label will always be displayed. Defaults to None. 
- grid_resolution (int): Number of samples of feature(s) for partial dependence plot - kind {'average', 'individual', 'both'}: Type of partial dependence to plot. 'average' creates a regular partial dependence + grid_resolution (int): Number of samples of feature(s) for partial dependence plot. + kind ({'average', 'individual', 'both'}): Type of partial dependence to plot. 'average' creates a regular partial dependence (PD) graph, 'individual' creates an individual conditional expectation (ICE) plot, and 'both' creates a single-figure PD and ICE plot. ICE plots can only be shown for one-way partial dependence plots. @@ -1206,6 +1208,8 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): * `actual`: Real target values. * `outlier`: Colors indicating which values are in the threshold for what is considered an outlier value. + Raises: + ValueError: If threshold is not positive. """ if outlier_threshold and outlier_threshold <= 0: raise ValueError( @@ -1243,6 +1247,8 @@ def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): Returns: plotly.Figure representing the predicted vs. actual values graph + Raises: + ValueError: If threshold is not positive. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -1322,6 +1328,10 @@ def decision_tree_data_from_estimator(estimator): Returns: OrderedDict: An OrderedDict of OrderedDicts describing a tree structure. + + Raises: + ValueError: If estimator is not a decision tree-based estimator. + NotFittedError: If estimator is not yet fitted. """ if not estimator.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1344,7 +1354,11 @@ def decision_tree_data_from_pipeline(pipeline_): pipeline_ (PipelineBase): A pipeline with a DecisionTree-based estimator. Returns: - OrderedDict: An OrderedDict of OrderedDicts describing a tree structure + OrderedDict: An OrderedDict of OrderedDicts describing a tree structure. + + Raises: + ValueError: If estimator is not a decision tree-based estimator. + NotFittedError: If estimator is not yet fitted. """ if not pipeline_.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1378,6 +1392,10 @@ def visualize_decision_tree( Returns: graphviz.Source: DOT object that can be directly displayed in Jupyter notebooks. + + Raises: + ValueError: If estimator is not a decision tree-based estimator. + NotFittedError: If estimator is not yet fitted. """ if not estimator.model_family == ModelFamily.DECISION_TREE: raise ValueError( @@ -1439,21 +1457,22 @@ def visualize_decision_tree( return source_obj -def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): +def get_prediction_vs_actual_over_time_data(pipeline, X, y, X_train, y_train, dates): """Get the data needed for the prediction_vs_actual_over_time plot. Args: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. + X_train (pd.DataFrame): Data the pipeline was trained on. + y_train (pd.Series): Target values for training data. dates (pd.Series): Dates corresponding to target values and predictions. Returns: - pd.DataFrame + pd.DataFrame: Predictions vs. time. 
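(The restored X_train/y_train signature matches the notebook cell earlier in this patch; a minimal sketch against a fitted time series regression pipeline, with hypothetical split names.)

    from evalml.model_understanding import graph_prediction_vs_actual_over_time

    fig = graph_prediction_vs_actual_over_time(
        pipeline, X_test, y_test, X_train, y_train, dates=X_test["Date"]
    )
    fig.show()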
""" dates = infer_feature_types(dates) - y = infer_feature_types(y) - prediction = pipeline.predict(X, y) + prediction = pipeline.predict_in_sample(X, y, X_train=X_train, y_train=y_train) return pd.DataFrame( { @@ -1464,17 +1483,22 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): ) -def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): +def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates): """Plot the target values and predictions against time on the x-axis. Args: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. X (pd.DataFrame): Features used to generate new predictions. y (pd.Series): Target values to compare predictions against. + X_train (pd.DataFrame): Data the pipeline was trained on. + y_train (pd.Series): Target values for training data. dates (pd.Series): Dates corresponding to target values and predictions. Returns: plotly.Figure: Showing the prediction vs actual over time. + + Raises: + ValueError: If the pipeline is not a time-series regression pipeline. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" @@ -1486,7 +1510,9 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): f"Received {str(pipeline.problem_type)}." ) - data = get_prediction_vs_actual_over_time_data(pipeline, X, y, dates) + data = get_prediction_vs_actual_over_time_data( + pipeline, X, y, X_train, y_train, dates + ) data = [ _go.Scatter( @@ -1523,6 +1549,10 @@ def get_linear_coefficients(estimator, features=None): Returns: pd.DataFrame: Displaying the features by importance. + + Raises: + ValueError: If the model is not a linear model. + NotFittedError: If the model is not yet fitted. """ if not estimator.model_family == ModelFamily.LINEAR_MODEL: raise ValueError( @@ -1556,14 +1586,16 @@ def t_sne( Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. - perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning - algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. - learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad - local minimum, increasing the learning rate may help. + perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. + learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. + **kwargs: Additional abritrary parameters. Returns: - np.ndarray (n_samples, n_components) + np.ndarray (n_samples, n_components): TSNE output. + + Raises: + ValueError: If specified parameters are not valid values. """ if not isinstance(n_components, int) or not n_components > 0: raise ValueError( @@ -1594,7 +1626,7 @@ def graph_t_sne( marker_size=7, **kwargs, ): - """Plot high dimensional data into lower dimensional space using t-SNE . + """Plot high dimensional data into lower dimensional space using t-SNE. Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. 
@@ -1608,8 +1640,10 @@ def graph_t_sne( marker_size (int, optional): Determines the size of the marker. Returns: - plotly.Figure representing the transformed data + plotly.Figure: Figure representing the transformed data. + Raises: + ValueError: If marker_line_width or marker_size are not valid values. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" diff --git a/evalml/model_understanding/prediction_explanations/_user_interface.py b/evalml/model_understanding/prediction_explanations/_user_interface.py index 63a65fe08b..2ae64db1be 100644 --- a/evalml/model_understanding/prediction_explanations/_user_interface.py +++ b/evalml/model_understanding/prediction_explanations/_user_interface.py @@ -53,12 +53,12 @@ def _make_rows( display_text = symbol * min(int(abs(value) // 0.2) + 1, 5) # At this point, the feature is either in the original data or the data - # the final estimator sees. So if it is not a pipeline feature, it is - # an original feature - if feature_name in pipeline_features.columns: - feature_value = pipeline_features[feature_name].iloc[0] - else: + # the final estimator sees, or both. We use the original feature value if possible + is_original_feature = feature_name in original_features.columns + if is_original_feature: feature_value = original_features[feature_name].iloc[0] + else: + feature_value = pipeline_features[feature_name].iloc[0] if convert_numeric_to_string: if pd.api.types.is_number(feature_value) and not pd.api.types.is_bool( diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt index e113b73321..6a91ae6ff7 100644 --- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt +++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt @@ -1,6 +1,6 @@ catboost==0.26.1 click==8.0.1 -cloudpickle==1.6.0 +cloudpickle==2.0.0 colorama==0.4.4 dask==2021.9.0 featuretools==0.27.1 diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py index 373b1a220e..149e64337d 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py @@ -1782,3 +1782,111 @@ def test_explain_predictions_url_email(df_with_url_and_email): .isnull() .any() ) + + +@pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) +def test_explain_predictions_report_shows_original_value_if_possible( + pipeline_class, estimator, fraud_100 +): + X, y = fraud_100 + X.ww.set_types({"country": "NaturalLanguage"}) + component_graph = [ + "Imputer", + "DateTime Featurization Component", + "Text Featurization Component", + "One Hot Encoder", + "Standard Scaler", + estimator, + ] + parameters = { + estimator: {"n_jobs": 1}, + } + pipeline = pipeline_class(component_graph=component_graph, parameters=parameters) + + y = transform_y_for_problem_type(pipeline.problem_type, y) + + pipeline.fit(X, y) + + report = explain_predictions( + pipeline, X, y, indices_to_explain=[0], output_format="dict", top_k_features=20 + ) + expected_feature_values = set(X.ww.iloc[0, :].tolist()) + for explanation in report["explanations"][0]["explanations"]: + assert set(explanation["feature_names"]) == set(X.columns) + assert set(explanation["feature_values"]) == expected_feature_values + + 
X_null = X.ww.copy() + X_null.loc[0, "lat"] = None + X_null.ww.init(schema=X.ww.schema) + + report = explain_predictions( + pipeline, + X_null, + y, + indices_to_explain=[0], + output_format="dict", + top_k_features=20, + ) + for explanation in report["explanations"][0]["explanations"]: + assert set(explanation["feature_names"]) == set(X.columns) + for feature_name, feature_value in zip( + explanation["feature_names"], explanation["feature_values"] + ): + if feature_name == "lat": + assert np.isnan(feature_value) + + +def test_explain_predictions_best_worst_report_shows_original_value_if_possible( + fraud_100, +): + X, y = fraud_100 + X.ww.set_types({"country": "NaturalLanguage"}) + component_graph = [ + "Imputer", + "DateTime Featurization Component", + "Text Featurization Component", + "One Hot Encoder", + "Standard Scaler", + "Random Forest Classifier", + ] + parameters = { + "Random Forest Classifier": {"n_jobs": 1}, + } + pipeline = BinaryClassificationPipeline( + component_graph=component_graph, parameters=parameters + ) + + y = transform_y_for_problem_type(pipeline.problem_type, y) + + pipeline.fit(X, y) + report = explain_predictions_best_worst( + pipeline, X, y, num_to_explain=1, output_format="dict", top_k_features=20 + ) + + for index, explanation in enumerate(report["explanations"]): + for exp in explanation["explanations"]: + assert set(exp["feature_names"]) == set(X.columns) + assert set(exp["feature_values"]) == set( + X.ww.iloc[explanation["predicted_values"]["index_id"], :] + ) + + X_null = X.ww.copy() + X_null.loc[0:2, "lat"] = None + X_null.ww.init(schema=X.ww.schema) + + report = explain_predictions_best_worst( + pipeline, + X_null.ww.iloc[:2], + y.ww.iloc[:2], + num_to_explain=1, + output_format="dict", + top_k_features=20, + ) + for explanation in report["explanations"]: + for exp in explanation["explanations"]: + assert set(exp["feature_names"]) == set(X.columns) + for feature_name, feature_value in zip( + exp["feature_names"], exp["feature_values"] + ): + if feature_name == "lat": + assert np.isnan(feature_value) diff --git a/evalml/tests/model_understanding_tests/test_graphs.py b/evalml/tests/model_understanding_tests/test_graphs.py index 78029c0c2e..fb4071ca20 100644 --- a/evalml/tests/model_understanding_tests/test_graphs.py +++ b/evalml/tests/model_understanding_tests/test_graphs.py @@ -43,6 +43,7 @@ LinearRegressor, MulticlassClassificationPipeline, RegressionPipeline, + TimeSeriesRegressionPipeline, ) from evalml.problem_types import ProblemTypes from evalml.utils import get_random_state, infer_feature_types @@ -994,46 +995,56 @@ def test_graph_prediction_vs_actual(data_type): assert fig_dict["data"][2]["name"] == ">= outlier_threshold" -@patch("evalml.pipelines.ClassificationPipeline.predict") -@pytest.mark.parametrize("data_type", ["pd", "ww"]) -def test_get_prediction_vs_actual_over_time_data( - mock_predict, data_type, logistic_regression_binary_pipeline_class, make_data_type -): - mock_predict.return_value = pd.Series([0] * 20) - X = make_data_type(data_type, pd.DataFrame()) - y = make_data_type(data_type, pd.Series([0] * 20)) - dates = make_data_type( - data_type, pd.Series(pd.date_range("2000-05-19", periods=20, freq="D")) +def test_get_prediction_vs_actual_over_time_data(ts_data): + X, y = ts_data + X_train, y_train = X.iloc[:15], y.iloc[:15] + X_test, y_test = X.iloc[15:], y.iloc[15:] + + pipeline = TimeSeriesRegressionPipeline( + ["Elastic Net Regressor"], + parameters={ + "pipeline": { + "gap": 0, + "max_delay": 2, + "forecast_horizon": 1, + 
"date_index": None, + } + }, ) - pipeline = logistic_regression_binary_pipeline_class(parameters={}) - results = get_prediction_vs_actual_over_time_data(pipeline, X, y, dates) + pipeline.fit(X_train, y_train) + results = get_prediction_vs_actual_over_time_data( + pipeline, X_test, y_test, X_train, y_train, pd.Series(X_test.index) + ) assert isinstance(results, pd.DataFrame) assert list(results.columns) == ["dates", "target", "prediction"] -def test_graph_prediction_vs_actual_over_time(): +def test_graph_prediction_vs_actual_over_time(ts_data): go = pytest.importorskip( "plotly.graph_objects", reason="Skipping plotting test because plotly not installed", ) - class MockPipeline: - problem_type = ProblemTypes.TIME_SERIES_REGRESSION - - def predict(self, X, y): - y = infer_feature_types(y) - preds = y + 10 - preds.index = range(100, 161) - return preds + X, y = ts_data + X_train, y_train = X.iloc[:15], y.iloc[:15] + X_test, y_test = X.iloc[15:], y.iloc[15:] - y = pd.Series(np.arange(61), index=range(200, 261)) - dates = pd.Series(pd.date_range("2020-03-01", "2020-04-30")) - pipeline = MockPipeline() + pipeline = TimeSeriesRegressionPipeline( + ["Elastic Net Regressor"], + parameters={ + "pipeline": { + "gap": 0, + "max_delay": 2, + "forecast_horizon": 1, + "date_index": None, + } + }, + ) + pipeline.fit(X_train, y_train) - # For this test it doesn't matter what the features are fig = graph_prediction_vs_actual_over_time( - pipeline, X=pd.DataFrame(), y=y, dates=dates + pipeline, X_test, y_test, X_train, y_train, pd.Series(X_test.index) ) assert isinstance(fig, go.Figure) @@ -1045,12 +1056,12 @@ def predict(self, X, y): ) assert len(fig_dict["data"]) == 2 assert fig_dict["data"][0]["line"]["color"] == "#1f77b4" - assert len(fig_dict["data"][0]["x"]) == 61 + assert len(fig_dict["data"][0]["x"]) == X_test.shape[0] assert not np.isnan(fig_dict["data"][0]["y"]).all() - assert len(fig_dict["data"][0]["y"]) == 61 + assert len(fig_dict["data"][0]["y"]) == X_test.shape[0] assert fig_dict["data"][1]["line"]["color"] == "#d62728" - assert len(fig_dict["data"][1]["x"]) == 61 - assert len(fig_dict["data"][1]["y"]) == 61 + assert len(fig_dict["data"][1]["x"]) == X_test.shape[0] + assert len(fig_dict["data"][1]["y"]) == X_test.shape[0] assert not np.isnan(fig_dict["data"][1]["y"]).all() @@ -1065,7 +1076,9 @@ class NotTSPipeline: error_msg = "graph_prediction_vs_actual_over_time only supports time series regression pipelines! Received regression." with pytest.raises(ValueError, match=error_msg): - graph_prediction_vs_actual_over_time(NotTSPipeline(), None, None, None) + graph_prediction_vs_actual_over_time( + NotTSPipeline(), None, None, None, None, None + ) def test_decision_tree_data_from_estimator_not_fitted(tree_estimators): diff --git a/setup.py b/setup.py index 1a3a547f8c..05f232b794 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name='evalml', - version='0.32.1', + version='0.32.0', author='Alteryx, Inc.', author_email='support@featurelabs.com', description='EvalML is an AutoML library that builds, optimizes, and evaluates machine learning pipelines using domain-specific objective functions.', From f66fcc52acaf70ac7339161890b65869ffac9481 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 10 Sep 2021 23:30:35 -0400 Subject: [PATCH 49/62] ??? 
---
 evalml/model_understanding/graphs.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index bf5bfa3da9..f32a98bd71 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -1589,7 +1589,6 @@ def t_sne(
         perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50.
         learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help.
         metric (str, optional): The metric to use when calculating distance between instances in a feature array.
-        **kwargs: Additional arbitrary parameters.
 
     Returns:
         np.ndarray (n_samples, n_components): TSNE output.

From 11cd30e71f136ecf80e0108ff22cf8eb4667b9c8 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Fri, 10 Sep 2021 23:53:13 -0400
Subject: [PATCH 50/62] change indentation

---
 evalml/model_understanding/graphs.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index f32a98bd71..766aecfb78 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -1382,13 +1382,10 @@ def visualize_decision_tree(
 
     Args:
         estimator (ComponentBase): A fitted DecisionTree-based estimator.
-        max_depth (int, optional): The depth to which the tree should be displayed. If set to None (as by default),
-            tree is fully generated.
+        max_depth (int, optional): The depth to which the tree should be displayed. If set to None (as by default), tree is fully generated.
         rotate (bool, optional): Orient tree left to right rather than top-down.
-        filled (bool, optional): Paint nodes to indicate majority class for classification, extremity of values for
-            regression, or purity of node for multi-output.
-        filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph
-            will not be saved.
+        filled (bool, optional): Paint nodes to indicate majority class for classification, extremity of values for regression, or purity of node for multi-output.
+        filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph will not be saved.
 
     Returns:
         graphviz.Source: DOT object that can be directly displayed in Jupyter notebooks.
@@ -1630,10 +1627,8 @@ def graph_t_sne(
     Args:
         X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric.
         n_components (int, optional): Dimension of the embedded space.
-        perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning
-            algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50.
-        learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad
-            local minimum, increasing the learning rate may help.
+        perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50.
+        learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help.
metric (str, optional): The metric to use when calculating distance between instances in a feature array. marker_line_width (int, optional): Determines the line width of the marker boundary. marker_size (int, optional): Determines the size of the marker. From 3c889a8c5a1d69235b9ebf2bab97a8c5064c53aa Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sat, 11 Sep 2021 02:01:02 -0400 Subject: [PATCH 51/62] try --- evalml/model_understanding/graphs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 766aecfb78..3d11801376 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1586,6 +1586,7 @@ def t_sne( perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. + **kwargs: Additional keyword arguments. Returns: np.ndarray (n_samples, n_components): TSNE output. @@ -1632,6 +1633,7 @@ def graph_t_sne( metric (str, optional): The metric to use when calculating distance between instances in a feature array. marker_line_width (int, optional): Determines the line width of the marker boundary. marker_size (int, optional): Determines the size of the marker. + **kwargs: Additional keyword arguments. Returns: plotly.Figure: Figure representing the transformed data. From 6720618cbf8fc7869a33e6d4ceb37de5ebc6eea0 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sat, 11 Sep 2021 02:20:36 -0400 Subject: [PATCH 52/62] try again --- evalml/model_understanding/graphs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 3d11801376..4686cf2884 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1578,7 +1578,7 @@ def t_sne( metric="euclidean", **kwargs, ): - """Get the transformed output after fitting X to the embedded space using t-SNE. + """Get the transformed output after fitting X to the embedded space using t SNE. Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. @@ -1586,7 +1586,7 @@ def t_sne( perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. - **kwargs: Additional keyword arguments. + **kwargs: Arbitrary keyword arguments. Returns: np.ndarray (n_samples, n_components): TSNE output. @@ -1623,7 +1623,7 @@ def graph_t_sne( marker_size=7, **kwargs, ): - """Plot high dimensional data into lower dimensional space using t-SNE. + """Plot high dimensional data into lower dimensional space using t SNE. Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. 
@@ -1633,7 +1633,7 @@ def graph_t_sne( metric (str, optional): The metric to use when calculating distance between instances in a feature array. marker_line_width (int, optional): Determines the line width of the marker boundary. marker_size (int, optional): Determines the size of the marker. - **kwargs: Additional keyword arguments. + **kwargs: Arbitrary keyword arguments. Returns: plotly.Figure: Figure representing the transformed data. From 39ca658ce8d47d0fb251036808c77a5ba6a20a1b Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sat, 11 Sep 2021 08:48:00 -0400 Subject: [PATCH 53/62] test square brackets --- evalml/model_understanding/graphs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 4686cf2884..b3df98aede 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1584,7 +1584,6 @@ def t_sne( X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. - learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. **kwargs: Arbitrary keyword arguments. @@ -1629,7 +1628,6 @@ def graph_t_sne( X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. - learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. marker_line_width (int, optional): Determines the line width of the marker boundary. marker_size (int, optional): Determines the size of the marker. From 9b4e2cc81719c2bcaa030d95a78e7cd34b39a6a7 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sat, 11 Sep 2021 09:29:08 -0400 Subject: [PATCH 54/62] test underscore --- evalml/model_understanding/graphs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index b3df98aede..9f3b3645af 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1637,7 +1637,7 @@ def graph_t_sne( plotly.Figure: Figure representing the transformed data. Raises: - ValueError: If marker_line_width or marker_size are not valid values. + ValueError: If marker line width or marker size are not valid values. 
""" _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" From 2a166d8011ee3f57b47163ca1a7e5b28c887a861 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sat, 11 Sep 2021 09:42:53 -0400 Subject: [PATCH 55/62] remove raises --- evalml/model_understanding/graphs.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 9f3b3645af..0cf6724a83 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1589,9 +1589,6 @@ def t_sne( Returns: np.ndarray (n_samples, n_components): TSNE output. - - Raises: - ValueError: If specified parameters are not valid values. """ if not isinstance(n_components, int) or not n_components > 0: raise ValueError( @@ -1635,9 +1632,6 @@ def graph_t_sne( Returns: plotly.Figure: Figure representing the transformed data. - - Raises: - ValueError: If marker line width or marker size are not valid values. """ _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" From 9a98ba8c46c2c9647f6badf5367c6869320ca09a Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sat, 11 Sep 2021 10:00:21 -0400 Subject: [PATCH 56/62] backslash --- evalml/model_understanding/graphs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 0cf6724a83..60cf5717a2 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1585,7 +1585,7 @@ def t_sne( n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. metric (str, optional): The metric to use when calculating distance between instances in a feature array. - **kwargs: Arbitrary keyword arguments. + *\*\kwargs: Arbitrary keyword arguments. Returns: np.ndarray (n_samples, n_components): TSNE output. @@ -1628,7 +1628,7 @@ def graph_t_sne( metric (str, optional): The metric to use when calculating distance between instances in a feature array. marker_line_width (int, optional): Determines the line width of the marker boundary. marker_size (int, optional): Determines the size of the marker. - **kwargs: Arbitrary keyword arguments. + \*\*kwargs: Arbitrary keyword arguments. Returns: plotly.Figure: Figure representing the transformed data. From 32d5cc13b2f94b63ee8456871926983c38374d0e Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sat, 11 Sep 2021 10:20:12 -0400 Subject: [PATCH 57/62] try removal --- evalml/model_understanding/graphs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 60cf5717a2..1a4d2b81bb 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1585,7 +1585,6 @@ def t_sne( n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. metric (str, optional): The metric to use when calculating distance between instances in a feature array. - *\*\kwargs: Arbitrary keyword arguments. 
Returns: np.ndarray (n_samples, n_components): TSNE output. @@ -1628,7 +1627,6 @@ def graph_t_sne( metric (str, optional): The metric to use when calculating distance between instances in a feature array. marker_line_width (int, optional): Determines the line width of the marker boundary. marker_size (int, optional): Determines the size of the marker. - \*\*kwargs: Arbitrary keyword arguments. Returns: plotly.Figure: Figure representing the transformed data. From 4fc6db7c4db93968c156a6f7152afda427836b92 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sat, 11 Sep 2021 14:46:08 -0400 Subject: [PATCH 58/62] test --- evalml/model_understanding/graphs.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 1a4d2b81bb..ec1c434e7f 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -1578,16 +1578,21 @@ def t_sne( metric="euclidean", **kwargs, ): - """Get the transformed output after fitting X to the embedded space using t SNE. + """Get the transformed output after fitting X to the embedded space using t-SNE. Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. + learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. + kwargs: Arbitrary keyword arguments. Returns: np.ndarray (n_samples, n_components): TSNE output. + + Raises: + ValueError: If specified parameters are not valid values. """ if not isinstance(n_components, int) or not n_components > 0: raise ValueError( @@ -1618,18 +1623,23 @@ def graph_t_sne( marker_size=7, **kwargs, ): - """Plot high dimensional data into lower dimensional space using t SNE. + """Plot high dimensional data into lower dimensional space using t-SNE. Args: X (np.ndarray, pd.DataFrame): Data to be transformed. Must be numeric. n_components (int, optional): Dimension of the embedded space. perplexity (float, optional): Related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. + learning_rate (float, optional): Usually in the range [10.0, 1000.0]. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. metric (str, optional): The metric to use when calculating distance between instances in a feature array. marker_line_width (int, optional): Determines the line width of the marker boundary. marker_size (int, optional): Determines the size of the marker. + kwargs: Arbitrary keyword arguments. Returns: plotly.Figure: Figure representing the transformed data. + + Raises: + ValueError: If marker_line_width or marker_size are not valid values. 
""" _go = import_or_raise( "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" From eea28fcf5bcade7e5f9f378517af5fa1a4e4399f Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sun, 12 Sep 2021 16:07:08 -0400 Subject: [PATCH 59/62] clean up setup --- Makefile | 2 +- setup.cfg | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2523e791b7..9dc81ad536 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ lint: python docs/notebook_version_standardizer.py check-versions black evalml -t py39 --check pydocstyle evalml/ --convention=google --add-ignore=D107 --add-select=D400 --match-dir='^(?!(tests)).*' - find evalml -type f -not -path "evalml/tests/*" -a -name "*.py" | xargs flake8 + flake8 evalml .PHONY: lint-fix lint-fix: diff --git a/setup.cfg b/setup.cfg index 6d47d023ae..44da5cd798 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,6 +11,7 @@ exclude = docs/* ignore = E501,W504,W503 per-file-ignores = **/__init__.py:F401 + **/tests/*:D [metadata] description-file = README.md [aliases] From 57941fc47ad635cabf8d77a3e9b1ff77a87b5bad Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Sun, 12 Sep 2021 22:53:54 -0400 Subject: [PATCH 60/62] cleanup and revert accidental merge diffs --- docs/source/release_notes.rst | 16 ++++++++++++++-- evalml/__init__.py | 2 +- evalml/model_understanding/graphs.py | 2 +- evalml/problem_types/utils.py | 2 +- setup.py | 2 +- 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 2ad1326f38..ac8666237b 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -1,6 +1,19 @@ Release Notes ------------- **Future Releases** + * Enhancements + * Fixes + * Changes + * Added docstring linting packages ``pydocstyle`` and ``darglint`` to `make-lint` command :pr:`2670` + * Documentation Changes + * Testing Changes + +.. warning:: + + **Breaking Changes** + + +**v0.32.1 Sep. 10, 2021** * Enhancements * Added ``verbose`` flag to ``AutoMLSearch`` to run search in silent mode by default :pr:`2645` * Added label encoder to ``XGBoostClassifier`` to remove the warning :pr:`2701` @@ -24,8 +37,7 @@ Release Notes * Changed time series pipeline predictions to no longer output series/dataframes padded with NaNs. 
      A prediction will be returned for every row in the `X` input :pr:`2697`
     * Documentation Changes
         * Specified installation steps for Prophet :pr:`2713`
-        * Added documentation for data exploration on data check actions :pr:`2696`
-        * Added docstring linting package ``pydocstyle`` and rule to `make-lint` command :pr:`2670`
+        * Added documentation for data exploration on data check actions :pr:`2696`
         * Added a user guide entry for time series modelling :pr:`2697`
     * Testing Changes
         * Fixed flaky ``TargetDistributionDataCheck`` test for very_lognormal distribution :pr:`2748`
diff --git a/evalml/__init__.py b/evalml/__init__.py
index 3b2de0974e..7c99ae80ed 100644
--- a/evalml/__init__.py
+++ b/evalml/__init__.py
@@ -22,4 +22,4 @@
 warnings.filterwarnings("ignore", category=FutureWarning)
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 
-__version__ = "0.32.0"
+__version__ = "0.32.1"
diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py
index ec1c434e7f..6f6f778c63 100644
--- a/evalml/model_understanding/graphs.py
+++ b/evalml/model_understanding/graphs.py
@@ -518,7 +518,7 @@ def _is_feature_of_type(feature, X, ltype):
 
 
 def _put_categorical_feature_first(features, first_feature_categorical):
-    """If the user is doing a two-way partial dependence plot and one of the features is categorical, we need to make sure the categorical feature is the first element in the tuple that's passed to sklearn.
+    """If the user is doing a two-way partial dependence plot and one of the features is categorical, we need to ensure the categorical feature is the first element in the tuple that's passed to sklearn.
 
     This is because in the two-way grid calculation, sklearn will try to coerce every element
     of the grid to the type of the first feature in the tuple. If we put the categorical feature first, the grid will be of type 'object'
diff --git a/evalml/problem_types/utils.py b/evalml/problem_types/utils.py
index 26926931b1..7a08f5808e 100644
--- a/evalml/problem_types/utils.py
+++ b/evalml/problem_types/utils.py
@@ -32,7 +32,7 @@ def handle_problem_types(problem_type):
 
 
 def detect_problem_type(y):
-    """Determine the type of problem is being solved based on the targets (binary vs multiclass classification, regression) Ignores missing and null data.
+    """Determine the type of problem being solved based on the targets (binary vs multiclass classification, regression). Ignores missing and null data.
 
     Args:
         y (pd.Series): The target labels to predict.
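Context for the `detect_problem_type` change above: the function infers the problem type from the target values alone. A minimal usage sketch follows; the `evalml.problem_types` import path and the expected `ProblemTypes` results are inferred from the docstring, not captured from this branch.

    import pandas as pd

    from evalml.problem_types import ProblemTypes, detect_problem_type

    # Two unique non-null labels should be detected as a binary problem.
    assert detect_problem_type(pd.Series([0, 1, 1, 0, 1])) == ProblemTypes.BINARY

    # Three or more discrete labels should be detected as multiclass.
    assert detect_problem_type(pd.Series(["a", "b", "c", "a"])) == ProblemTypes.MULTICLASS
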
diff --git a/setup.py b/setup.py index 05f232b794..1a3a547f8c 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name='evalml', - version='0.32.0', + version='0.32.1', author='Alteryx, Inc.', author_email='support@featurelabs.com', description='EvalML is an AutoML library that builds, optimizes, and evaluates machine learning pipelines using domain-specific objective functions.', From a81853699d8a6ccd8a7551898e4e509fe5801c8e Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 14 Sep 2021 15:48:46 -0400 Subject: [PATCH 61/62] clean up from comments --- contributing.md | 6 +++++- evalml/automl/pipeline_search_plots.py | 3 +++ evalml/data_checks/class_imbalance_data_check.py | 4 +++- evalml/data_checks/outliers_data_check.py | 16 +++++++++++++++- .../ensemble/sklearn_stacked_ensemble_base.py | 5 +++++ .../transformers/samplers/base_sampler.py | 5 ++++- .../time_series_classification_pipelines.py | 2 +- 7 files changed, 36 insertions(+), 5 deletions(-) diff --git a/contributing.md b/contributing.md index 33bd10800d..6e7458091c 100644 --- a/contributing.md +++ b/contributing.md @@ -119,7 +119,11 @@ One of the package maintainers will then review your PR! * Make PRs as small as possible! Consider breaking your large changes into separate PRs. This will make code review easier, quicker, less bug-prone and more effective. * In the name of every branch you create, include the associated issue number if applicable. * If new changes are added to the branch you're basing your changes off of, consider using `git rebase -i base_branch` rather than merging the base branch, to keep history clean. -* Always include a docstring for public methods and classes. Consider including docstrings for private methods too. Our docstring convention is [`sphinx.ext.napoleon`](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html). +* Always include a docstring for public methods and classes. Consider including docstrings for private methods too. We use the [Google docstring convention](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings), and use the [`sphinx.ext.napoleon`](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html) extension to parse our docstrings. +* Although not explicitly enforced by the Google convention, keep the following stylistic conventions for docstrings in mind: + - First letter of each argument description should be capitalized. + - Docstring sentences should end in periods. This includes descriptions for each argument. + - Types should be written in lower-case. For example, use "bool" instead of "Bool". * Use [PascalCase (upper camel case)](https://en.wikipedia.org/wiki/Camel_case#Variations_and_synonyms) for class names, and [snake_case](https://en.wikipedia.org/wiki/Snake_case) for method and class member names. * To distinguish private methods and class attributes from public ones, those which are private should be prefixed with an underscore * Any code which doesn't need to be public should be private. Use `@staticmethod` and `@classmethod` where applicable, to indicate no side effects. diff --git a/evalml/automl/pipeline_search_plots.py b/evalml/automl/pipeline_search_plots.py index c2fafbeb59..917b25327f 100644 --- a/evalml/automl/pipeline_search_plots.py +++ b/evalml/automl/pipeline_search_plots.py @@ -106,6 +106,9 @@ def search_iteration_plot(self, interactive_plot=False): Returns: plot + + Raises: + ValueError: If engine_str is not a valid engine. 
""" if not interactive_plot: plot_obj = SearchIterationPlot(self.results, self.objective) diff --git a/evalml/data_checks/class_imbalance_data_check.py b/evalml/data_checks/class_imbalance_data_check.py index 0a9410c7eb..3c29ce4fdb 100644 --- a/evalml/data_checks/class_imbalance_data_check.py +++ b/evalml/data_checks/class_imbalance_data_check.py @@ -46,7 +46,9 @@ def __init__(self, threshold=0.1, min_samples=100, num_cv_folds=3): self.cv_folds = num_cv_folds * 2 def validate(self, X, y): - """Check if any target labels are imbalanced beyond a threshold for binary and multiclass problems Ignores NaN values in target labels if they appear. + """Check if any target labels are imbalanced beyond a threshold for binary and multiclass problems. + + Ignores NaN values in target labels if they appear. Args: X (pd.DataFrame, np.ndarray): Features. Ignored. diff --git a/evalml/data_checks/outliers_data_check.py b/evalml/data_checks/outliers_data_check.py index 4a7b5cfebf..666181396e 100644 --- a/evalml/data_checks/outliers_data_check.py +++ b/evalml/data_checks/outliers_data_check.py @@ -82,7 +82,21 @@ def validate(self, X, y=None): @staticmethod def _no_outlier_prob(num_records: int, pct_outliers: float) -> float: - """Calculate the probability that there are no true outliers in a numeric (integer or float) column. It is based on creating 100,000 samples consisting of a given number of records, and then repeating this over a grid of sample sizes. Each value in a sample is drawn from a log normal distribution, and then the number of potential outliers in the data is determined using the skew adjusted box plot approach based on the medcouple statistic. It was observed that the distribution of the percentage of outliers could be described by a gamma distribution, with the shape and scale parameters changing with the sample size. For each sample size, the shape and scale parameters of the gamma distriubtion were estimated using maximum likelihood methods. The set of estimate shape and scale parameters for different sample size were then used to fit equations that relate these two parameters to the sample size. These equations use a transendental logrithmic functional form that provides a seventh order Taylor series approximation to the two true functional relationships, and was estimated using least squares regression. + """Calculate the probability that there are no true outliers in a numeric (integer or float) column. + + It is based on creating 100,000 samples consisting of a given number of records, and then repeating + this over a grid of sample sizes. Each value in a sample is drawn from a log normal distribution, + and then the number of potential outliers in the data is determined using the skew adjusted box plot + approach based on the medcouple statistic. + + It was observed that the distribution of the percentage of outliers could be described by a gamma distribution, + with the shape and scale parameters changing with the sample size. + For each sample size, the shape and scale parameters of the gamma distriubtion were estimated using maximum + likelihood methods. The set of estimate shape and scale parameters for different sample size were then used + to fit equations that relate these two parameters to the sample size. + + These equations use a transendental logrithmic functional form that provides a seventh order Taylor series + approximation to the two true functional relationships, and was estimated using least squares regression. 
         Original credit goes to Jad Raad and Dan Putler of Alteryx.
 
diff --git a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py
index 79414eb9d6..682d202783 100644
--- a/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py
+++ b/evalml/pipelines/components/ensemble/sklearn_stacked_ensemble_base.py
@@ -28,6 +28,11 @@ class SklearnStackedEnsembleBase(Estimator):
             Defaults to -1.
             - Note: there could be some multi-process errors thrown for values of `n_jobs != 1`. If this is the case, please use `n_jobs = 1`.
         random_seed (int): Seed for the random number generator. Defaults to 0.
+
+    Raises:
+        EnsembleMissingPipelinesError: If `input_pipelines` is None or an empty list.
+        ValueError: If any of the input pipelines cannot be used in a stacked ensemble.
+
     """
 
     model_family = ModelFamily.ENSEMBLE
diff --git a/evalml/pipelines/components/transformers/samplers/base_sampler.py b/evalml/pipelines/components/transformers/samplers/base_sampler.py
index b489d4f23c..13cb43baca 100644
--- a/evalml/pipelines/components/transformers/samplers/base_sampler.py
+++ b/evalml/pipelines/components/transformers/samplers/base_sampler.py
@@ -77,7 +77,10 @@ def transform(self, X, y=None):
         return infer_feature_types(X_new), infer_feature_types(y_new)
 
     def _convert_dictionary(self, sampling_dict, y):
-        """Converts the provided sampling dictionary from a dictionary of ratios to a dictionary of number of samples. Expects the provided dictionary keys to be the target values y, and the associated values to be the min:max ratios. Converts and returns a dictionary with the same keys, but changes the values to be the number of samples rather than ratio.
+        """Converts the provided sampling dictionary from a dictionary of ratios to a dictionary of number of samples.
+
+        Expects the provided dictionary keys to be the target values y, and the associated values to be the min:max ratios.
+        Converts and returns a dictionary with the same keys, but changes the values to be the number of samples rather than ratio.
 
         Args:
             sampling_dict (dict): The input sampling dictionary passed in from user.
diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py
index 086139c973..13df5ffb5f 100644
--- a/evalml/pipelines/time_series_classification_pipelines.py
+++ b/evalml/pipelines/time_series_classification_pipelines.py
@@ -223,7 +223,7 @@ def predict_in_sample(self, X, y, X_train, y_train, objective=None):
         y (pd.Series): Future target of shape [n_samples].
         X_train (pd.DataFrame): Data the pipeline was trained on of shape [n_samples_train, n_features].
         y_train (pd.Series): Targets used to train the pipeline of shape [n_samples_train].
-        objective (ObjectiveBase, str): Objective used to threshold predicted probabilities, optional.
+        objective (ObjectiveBase, str): Objective used to threshold predicted probabilities, optional. Defaults to None.
 
     Returns:
         pd.Series: Estimated labels.

From f0bc3dba27d720e96f1b1f132742d58fd1e3d705 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Tue, 14 Sep 2021 16:04:01 -0400
Subject: [PATCH 62/62] add default to contributing

---
 contributing.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contributing.md b/contributing.md
index 6e7458091c..715c49dfe1 100644
--- a/contributing.md
+++ b/contributing.md
@@ -124,6 +124,7 @@ One of the package maintainers will then review your PR!
     - First letter of each argument description should be capitalized.
- Docstring sentences should end in periods. This includes descriptions for each argument. - Types should be written in lower-case. For example, use "bool" instead of "Bool". + - Always add the default value in the description of the argument, if applicable. For example, "Defaults to 1." * Use [PascalCase (upper camel case)](https://en.wikipedia.org/wiki/Camel_case#Variations_and_synonyms) for class names, and [snake_case](https://en.wikipedia.org/wiki/Snake_case) for method and class member names. * To distinguish private methods and class attributes from public ones, those which are private should be prefixed with an underscore * Any code which doesn't need to be public should be private. Use `@staticmethod` and `@classmethod` where applicable, to indicate no side effects.
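To make the conventions above concrete, here is a sketch of a docstring that follows all of them (Google style, capitalized argument descriptions that end in periods, lower-case types, and an explicit default). The `load_sample` function and its arguments are invented for illustration:

    # Hypothetical function, shown only to illustrate the docstring rules above.
    def load_sample(n_rows=None, verbose=True):
        """Load a sample dataset, which can be used for binary classification problems.

        Args:
            n_rows (int): Number of rows from the dataset to return. Defaults to None.
            verbose (bool): Whether to print information about the features and labels. Defaults to True.

        Returns:
            (pd.DataFrame, pd.Series): X and y.

        Raises:
            ValueError: If n_rows is negative.
        """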