import pandas as pd


def detect_label_leakage(X, y, threshold=.95):
    """Check if any of the features are highly correlated with the target.

    Currently only supports binary and numeric targets and features; any
    non-numeric feature columns are ignored.

    Args:
        X (pd.DataFrame or array-like): the input features to check. Non-DataFrame
            input is coerced with ``pd.DataFrame(X)``, matching detect_highly_null.
        y (pd.Series or array-like): the labels. Non-Series input is coerced
            with ``pd.Series(y)``.
        threshold (float): the absolute correlation threshold to be considered
            leakage. Defaults to .95

    Returns:
        dict: leaky feature labels mapped to their absolute correlation with y
    """
    # Coerce array-like inputs so both guardrails accept the same input types.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    # Only numeric/bool columns can be correlated with the target.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']
    X = X.select_dtypes(include=numerics)

    if len(X.columns) == 0:
        return {}

    corrs = X.corrwith(y).abs()
    # NaN correlations (e.g. from constant columns) compare False and drop out.
    return corrs[corrs >= threshold].to_dict()


def detect_highly_null(X, percent_threshold=.95):
    """Check if there are any highly-null columns in a dataframe.

    Args:
        X (pd.DataFrame or array-like): features. Non-DataFrame input is
            coerced with ``pd.DataFrame(X)``.
        percent_threshold (float): fraction of null values a column needs to be
            considered "highly-null". Defaults to .95

    Returns:
        dict: column label (name, or positional index for coerced input) mapped
        to its fraction of null values, for every column at or above the threshold
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    # isnull().mean() gives the per-column fraction of nulls in one pass.
    percent_null = X.isnull().mean().to_dict()
    return {col: pct for col, pct in percent_null.items() if pct >= percent_threshold}
add_result_callback, additional_objectives, null_threshold, random_state, verbose): if tuner is None: tuner = SKOptTuner self.objective = get_objective(objective) @@ -29,9 +29,9 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time, self.start_iteration_callback = start_iteration_callback self.add_result_callback = add_result_callback self.cv = cv + self.null_threshold = null_threshold self.verbose = verbose self.logger = Logger(self.verbose) - self.possible_pipelines = get_pipelines(problem_type=self.problem_type, model_types=model_types) self.objective = get_objective(objective) @@ -114,11 +114,16 @@ def fit(self, X, y, feature_types=None, raise_errors=False): self.logger.log("Possible model types: %s\n" % ", ".join([model.value for model in self.possible_model_types])) if self.detect_label_leakage: - leaked = preprocessing.detect_label_leakage(X, y) + leaked = guardrails.detect_label_leakage(X, y) if len(leaked) > 0: leaked = [str(k) for k in leaked.keys()] self.logger.log("WARNING: Possible label leakage: %s" % ", ".join(leaked)) + if self.null_threshold is not None: + highly_null_columns = guardrails.detect_highly_null(X, percent_threshold=self.null_threshold) + if len(highly_null_columns) > 0: + self.logger.log("WARNING: {} columns are at least {}% null.".format(', '.join(highly_null_columns), self.null_threshold * 100)) + pbar = tqdm(range(self.max_pipelines), disable=not self.verbose, file=stdout, bar_format='{desc} {percentage:3.0f}%|{bar}| Elapsed:{elapsed}') start = time.time() for n in pbar: diff --git a/evalml/models/auto_classifier.py b/evalml/models/auto_classifier.py index d8da1d89c2..93048229f9 100644 --- a/evalml/models/auto_classifier.py +++ b/evalml/models/auto_classifier.py @@ -22,6 +22,7 @@ def __init__(self, start_iteration_callback=None, add_result_callback=None, additional_objectives=None, + null_threshold=0.95, random_state=0, verbose=True): """Automated classifier pipeline search @@ -55,6 +56,9 @@ def 
__init__(self, additional_objectives (list): Custom set of objectives to score on. Will override default objectives for problem type if not empty. + null_threshold(float): Float in range [0,1] that represents what percentage of a feature needs to be + null values for the feature to be considered "highly-null". Default is 0.95. + random_state (int): the random_state verbose (boolean): If True, turn verbosity on. Defaults to True @@ -84,9 +88,10 @@ def __init__(self, detect_label_leakage=detect_label_leakage, start_iteration_callback=start_iteration_callback, add_result_callback=add_result_callback, + additional_objectives=additional_objectives, + null_threshold=null_threshold, random_state=random_state, verbose=verbose, - additional_objectives=additional_objectives ) def set_problem_type(self, objective, multiclass): diff --git a/evalml/models/auto_regressor.py b/evalml/models/auto_regressor.py index 9e0f8c0ab6..ff41386af0 100644 --- a/evalml/models/auto_regressor.py +++ b/evalml/models/auto_regressor.py @@ -19,6 +19,7 @@ def __init__(self, start_iteration_callback=None, add_result_callback=None, additional_objectives=None, + null_threshold=0.95, random_state=0, verbose=True): """Automated regressors pipeline search @@ -50,6 +51,9 @@ def __init__(self, additional_objectives (list): Custom set of objectives to score on. Will override default objectives for problem type if not empty. + null_threshold(float): Float in range [0,1] that represents what percentage of a feature needs to be + null values for the feature to be considered "highly-null". Default is 0.95. + random_state (int): the random_state verbose (boolean): If True, turn verbosity on. 
Defaults to True @@ -74,7 +78,8 @@ def __init__(self, detect_label_leakage=detect_label_leakage, start_iteration_callback=start_iteration_callback, add_result_callback=add_result_callback, + additional_objectives=additional_objectives, + null_threshold=null_threshold, random_state=random_state, - verbose=verbose, - additional_objectives=additional_objectives + verbose=verbose ) diff --git a/evalml/pipelines/__init__.py b/evalml/pipelines/__init__.py index 79b46ddd04..6f8e8216cd 100644 --- a/evalml/pipelines/__init__.py +++ b/evalml/pipelines/__init__.py @@ -11,4 +11,4 @@ list_model_types, load_pipeline, save_pipeline -) \ No newline at end of file +) diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index 3c2fcb81a3..99a24327dc 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -90,29 +90,3 @@ def number_of_features(dtypes): def label_distribution(labels): distribution = labels.value_counts() / len(labels) return distribution.mul(100).apply('{:.2f}%'.format).rename_axis('Labels') - - -def detect_label_leakage(X, y, threshold=.95): - """Check if any of the features are highly correlated with the target. - - Currently only supports binary and numeric targets and features - - Args: - X (pd.DataFrame): The input features to check - y (pd.Series): the labels - threshold (float): the correlation threshold to be considered leakage. 
def test_detect_highly_null():
    # Build a random frame, then null out a fixed count of leading rows per column.
    data = pd.DataFrame(np.random.random((100, 5)), columns=list("ABCDE"))
    null_counts = {'A': 12, 'B': 10, 'C': 31, 'D': 100, 'E': 90}
    for col, n_null in null_counts.items():
        data.loc[:n_null - 1, col] = np.nan

    # Only columns at or above the 90% threshold are reported, with exact fractions.
    assert detect_highly_null(data, percent_threshold=.90) == {'D': 1.0, 'E': 0.9}

    # numpy input is coerced to a DataFrame, so keys are positional column indices.
    all_null = np.full((10, 5), np.nan)
    assert detect_highly_null(all_null, percent_threshold=1.0) == {i: 1.0 for i in range(5)}
diff --git a/evalml/utils/__init__.py b/evalml/utils/__init__.py index 68bbdf0dd3..e8f774541a 100644 --- a/evalml/utils/__init__.py +++ b/evalml/utils/__init__.py @@ -1,3 +1,3 @@ # flake8:noqa from .logging_utils import Logger -from .convert_time import convert_to_seconds \ No newline at end of file +from .convert_time import convert_to_seconds