From f4738db297bfc63028398ca17437447ef1e377cf Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Wed, 6 Nov 2019 11:13:49 -0500
Subject: [PATCH 01/17] moving describe things

---
 docs/source/changelog.rst         |  1 +
 evalml/models/auto_base.py        | 18 +++---------------
 evalml/pipelines/pipeline_base.py | 15 +++++++++------
 3 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index ee0f5ff41f..2c9e052f0f 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -10,6 +10,7 @@ Changelog
     * Fixes
     * Changes
         * Refactoring pipelines :pr:`108`
+        * Pulling information for describe_pipeline from pipeline's new describe method :pr:``
     * Documentation Changes
     * Testing Changes
 
diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py
index 443b0b9748..1a94aae2ec 100644
--- a/evalml/models/auto_base.py
+++ b/evalml/models/auto_base.py
@@ -303,21 +303,9 @@ def describe_pipeline(self, pipeline_id, return_dict=False):
         pipeline = self.get_pipeline(pipeline_id)
         pipeline_results = self.results[pipeline_id]
 
-        self.logger.log_title("Pipeline Description")
-
-        better_string = "lower is better"
-        if pipeline.objective.greater_is_better:
-            better_string = "greater is better"
-
-        self.logger.log("Pipeline Name: %s" % pipeline.name)
-        self.logger.log("Model type: %s" % pipeline.model_type)
-        self.logger.log("Objective: %s (%s)" % (pipeline.objective.name, better_string))
-        self.logger.log("Total training time (including CV): %.1f seconds\n" % pipeline_results["training_time"])
-
-        self.logger.log_subtitle("Parameters")
-        for item in pipeline_results["parameters"].items():
-            self.logger.log("• %s: %s" % item)
-
+        pipeline.describe()
+
+        self.logger.log("\nTotal training time (including CV): %.1f seconds" % pipeline_results["training_time"])
         self.logger.log_subtitle("\nCross Validation")
 
         if pipeline_results["high_variance_cv"]:
diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py
index d02150ad9c..30c1fc35c3 100644
--- a/evalml/pipelines/pipeline_base.py
+++ b/evalml/pipelines/pipeline_base.py
@@ -76,15 +76,18 @@ def get_component(self, name):
         return next((component for component in self.component_list if component.name == name), None)
 
     def describe(self, return_dict=False):
-        """Outputs pipeline details including component parameters and cross validation information
+        """Outputs pipeline details including component parameters
 
-        Returns:
-
-            None
+        Arguments:
+            return_dict (bool): If True, return dictionary of information
+                about pipeline. Defaults to false
 
+        Returns:
+            dictionary of all component parameters if return_dict is True, else None
         """
-        title = "Pipeline: " + self.name
+        title = "Pipeline Name: {}".format(self.name)
         self.logger.log_title(title)
+        self.logger.log("Model type: {}".format(self.model_type))
 
         better_string = "lower is better"
         if self.objective.greater_is_better:
@@ -93,7 +96,7 @@ def describe(self, return_dict=False):
         self.logger.log(objective_string)
 
         # Summary of steps
-        self.logger.log_subtitle("Pipeline Steps")
+        self.logger.log_subtitle("\nPipeline Steps")
         for number, component in enumerate(self.component_list, 1):
             component_string = str(number) + ". " + component.name
             self.logger.log(component_string)

From dda82e4f3c68de4667c570bce4e7e5eff06e094c Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Wed, 6 Nov 2019 11:16:17 -0500
Subject: [PATCH 02/17] changelog~

---
 docs/source/changelog.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 2c9e052f0f..b74a4c5047 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -10,7 +10,7 @@ Changelog
     * Fixes
     * Changes
         * Refactoring pipelines :pr:`108`
-        * Pulling information for describe_pipeline from pipeline's new describe method :pr:``
+        * Pulling information for describe_pipeline from pipeline's new describe method :pr:`190`
     * Documentation Changes
     * Testing Changes
 
From 1cc2fd70042384383bea47a624ef22869b76ec50 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Wed, 6 Nov 2019 11:26:47 -0500
Subject: [PATCH 03/17] linting

---
 evalml/models/auto_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py
index 1a94aae2ec..ac179ea126 100644
--- a/evalml/models/auto_base.py
+++ b/evalml/models/auto_base.py
@@ -304,7 +304,7 @@ def describe_pipeline(self, pipeline_id, return_dict=False):
         pipeline_results = self.results[pipeline_id]
 
         pipeline.describe()
-        
+
         self.logger.log("\nTotal training time (including CV): %.1f seconds" % pipeline_results["training_time"])
         self.logger.log_subtitle("\nCross Validation")
 
From 0024fec31320315c606c4557d6cf9d94c0d69d80 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Wed, 6 Nov 2019 15:40:36 -0500
Subject: [PATCH 04/17] updating via comments

---
 evalml/models/auto_base.py        |  5 +++--
 evalml/pipelines/pipeline_base.py | 19 ++++++++++++++-----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py
index ac179ea126..05211ec00f 100644
--- a/evalml/models/auto_base.py
+++ b/evalml/models/auto_base.py
@@ -304,9 +304,10 @@ def describe_pipeline(self, pipeline_id, return_dict=False):
         pipeline_results = self.results[pipeline_id]
 
         pipeline.describe()
+        self.logger.log_subtitle("\nTraining")
 
-        self.logger.log("\nTotal training time (including CV): %.1f seconds" % pipeline_results["training_time"])
-        self.logger.log_subtitle("\nCross Validation")
+        self.logger.log("Total training time (including CV): %.1f seconds" % pipeline_results["training_time"])
+        self.logger.log_subtitle("\nCross Validation", underline="-")
 
         if pipeline_results["high_variance_cv"]:
diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py
index 30c1fc35c3..23a41bbf74 100644
--- a/evalml/pipelines/pipeline_base.py
+++ b/evalml/pipelines/pipeline_base.py
@@ -5,11 +5,21 @@
 
 from .components import Estimator, handle_component
 
+from evalml.model_types import ModelTypes
 from evalml.objectives import get_objective
+from evalml.problem_types import ProblemTypes
 from evalml.utils import Logger
 
 
 class PipelineBase:
+
+    model_type_dict = {ModelTypes.RANDOM_FOREST: "Random Forest",
+                       ModelTypes.XGBOOST: "XGBoost Classifier",
+                       ModelTypes.LINEAR_MODEL: "Linear Model"}
+    problem_type_dict = {ProblemTypes.BINARY: "Binary Classification",
+                         ProblemTypes.MULTICLASS: "Multiclass Classification",
+                         ProblemTypes.REGRESSION: "Regression"}
+
     def __init__(self, objective, component_list, n_jobs=-1, random_state=0):
         """Machine learning pipeline made out of transformers and a estimator.
@@ -85,14 +95,13 @@ def describe(self, return_dict=False):
         Returns:
             dictionary of all component parameters if return_dict is True, else None
         """
-        title = "Pipeline Name: {}".format(self.name)
-        self.logger.log_title(title)
-        self.logger.log("Model type: {}".format(self.model_type))
-
+        self.logger.log_title(self.name)
+        self.logger.log("Problem Types: {}".format(', '.join([self.problem_type_dict[problem_type] for problem_type in self.problem_types])))
+        self.logger.log("Model Type: {}".format(self.model_type_dict[self.model_type]))
         better_string = "lower is better"
         if self.objective.greater_is_better:
             better_string = "greater is better"
-        objective_string = "Objective: {} ({})".format(self.objective.name, better_string)
+        objective_string = "Objective to Optimize: {} ({})".format(self.objective.name, better_string)
         self.logger.log(objective_string)
 
         # Summary of steps

From a4d4b76d111a481a40bf5e22fd9cf2c44f70ef74 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Thu, 7 Nov 2019 16:03:50 -0500
Subject: [PATCH 05/17] remove dict, add to enum

---
 evalml/model_types/model_types.py     |  5 +++++
 evalml/pipelines/pipeline_base.py     | 11 ++---------
 evalml/problem_types/problem_types.py |  5 +++++
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/evalml/model_types/model_types.py b/evalml/model_types/model_types.py
index 3c2e1ad0fb..334c546b49 100644
--- a/evalml/model_types/model_types.py
+++ b/evalml/model_types/model_types.py
@@ -6,3 +6,8 @@ class ModelTypes(Enum):
     RANDOM_FOREST = 'random_forest'
     XGBOOST = 'xgboost'
     LINEAR_MODEL = 'linear_model'
+
+    def __str__(self):
+        if self.value in [ModelTypes.XGBOOST.value]:
+            return "XGBoost Classifier"
+        return self.value.replace(" ", "_").title()
diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py
index 23a41bbf74..b4634489e4 100644
--- a/evalml/pipelines/pipeline_base.py
+++ b/evalml/pipelines/pipeline_base.py
@@ -13,13 +13,6 @@
 
 
 class PipelineBase:
 
-    model_type_dict = {ModelTypes.RANDOM_FOREST: "Random Forest",
-                       ModelTypes.XGBOOST: "XGBoost Classifier",
-                       ModelTypes.LINEAR_MODEL: "Linear Model"}
-    problem_type_dict = {ProblemTypes.BINARY: "Binary Classification",
-                         ProblemTypes.MULTICLASS: "Multiclass Classification",
-                         ProblemTypes.REGRESSION: "Regression"}
-
     def __init__(self, objective, component_list, n_jobs=-1, random_state=0):
         """Machine learning pipeline made out of transformers and a estimator.
@@ -96,8 +89,8 @@ def describe(self, return_dict=False):
             dictionary of all component parameters if return_dict is True, else None
         """
         self.logger.log_title(self.name)
-        self.logger.log("Problem Types: {}".format(', '.join([self.problem_type_dict[problem_type] for problem_type in self.problem_types])))
-        self.logger.log("Model Type: {}".format(self.model_type_dict[self.model_type]))
+        self.logger.log("Problem Types: {}".format(', '.join([str(problem_type) for problem_type in self.problem_types])))
+        self.logger.log("Model Type: {}".format(str(self.model_type)))
         better_string = "lower is better"
         if self.objective.greater_is_better:
             better_string = "greater is better"
diff --git a/evalml/problem_types/problem_types.py b/evalml/problem_types/problem_types.py
index f55e8dd0bb..d812ec7e14 100644
--- a/evalml/problem_types/problem_types.py
+++ b/evalml/problem_types/problem_types.py
@@ -6,3 +6,8 @@ class ProblemTypes(Enum):
     BINARY = 'binary'
     MULTICLASS = 'multiclass'
     REGRESSION = 'regression'
+
+    def __str__(self):
+        if self.value in [ProblemTypes.BINARY.value, ProblemTypes.MULTICLASS.value]:
+            return "{} Classifier".format(self.value.title())
+        return self.value.title()

From 0a7385b63795a23a4ee245dbee0b7804edb22457 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Fri, 8 Nov 2019 17:09:01 -0500
Subject: [PATCH 06/17] reusing dict in str fcn

---
 evalml/model_types/model_types.py     | 7 ++++---
 evalml/problem_types/problem_types.py | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/evalml/model_types/model_types.py b/evalml/model_types/model_types.py
index 334c546b49..219d198072 100644
--- a/evalml/model_types/model_types.py
+++ b/evalml/model_types/model_types.py
@@ -8,6 +8,7 @@ class ModelTypes(Enum):
     LINEAR_MODEL = 'linear_model'
 
     def __str__(self):
-        if self.value in [ModelTypes.XGBOOST.value]:
-            return "XGBoost Classifier"
-        return self.value.replace(" ", "_").title()
+        model_type_dict = {ModelTypes.RANDOM_FOREST.name: "Random Forest",
+                           ModelTypes.XGBOOST.name: "XGBoost Classifier",
+                           ModelTypes.LINEAR_MODEL.name: "Linear Model"}
+        return model_type_dict[self.name]
diff --git a/evalml/problem_types/problem_types.py b/evalml/problem_types/problem_types.py
index d812ec7e14..20752965ac 100644
--- a/evalml/problem_types/problem_types.py
+++ b/evalml/problem_types/problem_types.py
@@ -8,6 +8,7 @@ class ProblemTypes(Enum):
     REGRESSION = 'regression'
 
     def __str__(self):
-        if self.value in [ProblemTypes.BINARY.value, ProblemTypes.MULTICLASS.value]:
-            return "{} Classifier".format(self.value.title())
-        return self.value.title()
+        problem_type_dict = {ProblemTypes.BINARY.name: "Binary Classification",
+                             ProblemTypes.MULTICLASS.name: "Multiclass Classification",
+                             ProblemTypes.REGRESSION.name: "Regression"}
+        return problem_type_dict[self.name]

From 3beeb16f3748a9247dabbc7e9516299d1609db03 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Fri, 8 Nov 2019 17:23:28 -0500
Subject: [PATCH 07/17] addressing comments on describe

---
 evalml/models/auto_base.py        | 7 ++++---
 evalml/pipelines/pipeline_base.py | 2 --
 evalml/utils/logging_utils.py     | 1 +
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py
index c46c1030cb..d6561f5bf6 100644
--- a/evalml/models/auto_base.py
+++ b/evalml/models/auto_base.py
@@ -310,10 +310,11 @@ def describe_pipeline(self, pipeline_id, return_dict=False):
         pipeline_results = self.results[pipeline_id]
 
         pipeline.describe()
-        self.logger.log_subtitle("\nTraining")
-
+        self.logger.log_subtitle("Training")
+        # 
Ideally, we want this information available on pipeline instead + self.logger.log("Training for {} problems.".format(self.problem_type)) self.logger.log("Total training time (including CV): %.1f seconds" % pipeline_results["training_time"]) - self.logger.log_subtitle("\nCross Validation", underline="-") + self.logger.log_subtitle("Cross Validation", underline="-") if pipeline_results["high_variance_cv"]: self.logger.log("Warning! High variance within cross validation scores. " + diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index b4634489e4..8973fb3a2e 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -5,9 +5,7 @@ from .components import Estimator, handle_component -from evalml.model_types import ModelTypes from evalml.objectives import get_objective -from evalml.problem_types import ProblemTypes from evalml.utils import Logger diff --git a/evalml/utils/logging_utils.py b/evalml/utils/logging_utils.py index 6a0ea37f64..af3ab8e968 100644 --- a/evalml/utils/logging_utils.py +++ b/evalml/utils/logging_utils.py @@ -24,5 +24,6 @@ def log_title(self, title): self.log("") def log_subtitle(self, title, underline="=", color=None): + self.log("") self.log("%s" % title, color=color) self.log(underline * len(title), color=color) From b81d793bed472d5aa7ccca4d036e31ffeebd6056 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 8 Nov 2019 18:01:37 -0500 Subject: [PATCH 08/17] updating notebooks --- docs/source/automl/guardrails.ipynb | 87 +++-- docs/source/automl/regression_example.ipynb | 107 +++--- docs/source/automl/search_results.ipynb | 405 ++++++++++---------- docs/source/demos/fraud.ipynb | 192 ++++++---- docs/source/index.ipynb | 83 ++-- 5 files changed, 470 insertions(+), 404 deletions(-) diff --git a/docs/source/automl/guardrails.ipynb b/docs/source/automl/guardrails.ipynb index 66adb8be7c..0e7688575b 100644 --- a/docs/source/automl/guardrails.ipynb +++ b/docs/source/automl/guardrails.ipynb @@ -46,13 +46,12 @@ "\n", "Optimizing for Precision. Greater score is better.\n", "\n", - "Searching up to 1 pipelines. No time limit is set. Set one using max_time parameter.\n", - "\n", + "Searching up to 1 pipelines. \n", "Possible model types: linear_model\n", "\n", "WARNING: Possible label leakage: leaked_feature, leaked_feature_2\n", - "✔ LogisticRegression w/ imputation + scaling: 0%| | Elapsed:00:03\n", - "✔ LogisticRegression w/ imputation + scaling: 100%|██████████| Elapsed:00:03\n", + "✔ Logistic Regression Classifier w/ O... 0%| | Elapsed:00:08\n", + "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:00:08\n", "\n", "✔ Optimization finished\n" ] @@ -182,12 +181,11 @@ "\n", "Optimizing for Precision. Greater score is better.\n", "\n", - "Searching up to 1 pipelines. No time limit is set. Set one using max_time parameter.\n", - "\n", - "Possible model types: linear_model, random_forest, xgboost\n", + "Searching up to 1 pipelines. \n", + "Possible model types: xgboost, linear_model, random_forest\n", "\n", - "✔ XGBoost w/ imputation: 0%| | Elapsed:00:00\n", - "✔ XGBoost w/ imputation: 100%|██████████| Elapsed:00:00\n", + "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00\n", + "✔ XGBoost Classifier w/ One Hot Encod... 
100%|██████████| Elapsed:00:00\n", "\n", "✔ Optimization finished\n" ] @@ -222,35 +220,45 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1m************************\u001b[0m\n", - "\u001b[1m* Pipeline Description *\u001b[0m\n", - "\u001b[1m************************\u001b[0m\n", + "\u001b[1m********************************************************************************************\u001b[0m\n", + "\u001b[1m* XGBoost Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model *\u001b[0m\n", + "\u001b[1m********************************************************************************************\u001b[0m\n", + "\n", + "Problem Types: Binary Classification, Multiclass Classification\n", + "Model Type: XGBoost Classifier\n", + "Objective to Optimize: Precision (greater is better)\n", "\n", - "Pipeline Name: XGBoost w/ imputation\n", - "Model type: ModelTypes.XGBOOST\n", - "Objective: Precision (greater is better)\n", - "Total training time (including CV): 0.4 seconds\n", "\n", - "Parameters\n", - "==========\n", - "• eta: 0.5928446182250184\n", - "• min_child_weight: 8.598391737229157\n", - "• max_depth: 4\n", - "• impute_strategy: most_frequent\n", - "• percent_features: 0.6273280598181127\n", + "Pipeline Steps\n", + "===============\n", + "1. One Hot Encoder\n", + "2. Simple Imputer\n", + "\t * impute_strategy : most_frequent\n", + "3. RF Classifier Select From Model\n", + "\t * percent_features : 0.6273280598181127\n", + "\t * threshold : -inf\n", + "4. XGBoost Classifier\n", + "\t * eta : 0.5928446182250184\n", + "\t * max_depth : 4\n", + "\t * min_child_weight : 8.598391737229157\n", + "\n", + "Training\n", + "========\n", + "Training for Binary Classification problems.\n", + "Total training time (including CV): 0.4 seconds\n", "\n", "Cross Validation\n", - "=================\n", + "----------------\n", " Precision F1 Recall AUC Log Loss MCC # Training # Testing\n", "0 0.974 0.822 0.822 0.950 0.578 0.650 83.000 81.000\n", "1 1.000 0.988 0.988 1.000 0.163 0.976 164.000 81.000\n", - "2 0.964 0.972 0.972 0.968 0.134 0.916 245.000 81.000\n", - "3 1.000 0.955 0.955 0.997 0.106 0.866 326.000 81.000\n", - "4 1.000 0.968 0.968 0.998 0.116 0.871 407.000 81.000\n", - "5 0.983 0.983 0.983 0.998 0.077 0.936 488.000 81.000\n", - "mean 0.987 0.948 0.948 0.985 0.196 0.869 - -\n", - "std 0.016 0.063 0.063 0.021 0.190 0.115 - -\n", - "coef of var 0.016 0.066 0.066 0.021 0.969 0.132 - -\n" + "2 0.981 0.981 0.981 0.968 0.139 0.944 245.000 81.000\n", + "3 0.963 0.929 0.929 0.991 0.113 0.774 326.000 81.000\n", + "4 0.984 0.960 0.960 0.993 0.147 0.830 407.000 81.000\n", + "5 0.983 0.983 0.983 0.998 0.083 0.936 488.000 81.000\n", + "mean 0.981 0.944 0.944 0.983 0.204 0.852 - -\n", + "std 0.012 0.064 0.064 0.020 0.186 0.125 - -\n", + "coef of var 0.013 0.067 0.067 0.020 0.909 0.147 - -\n" ] } ], @@ -309,7 +317,7 @@ " 0\n", " 0\n", " XGBoostPipeline\n", - " 0.986776\n", + " 0.980845\n", " False\n", " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", @@ -319,7 +327,7 @@ ], "text/plain": [ " id pipeline_name score high_variance_cv \\\n", - "0 0 XGBoostPipeline 0.986776 False \n", + "0 0 XGBoostPipeline 0.980845 False \n", "\n", " parameters \n", "0 {'eta': 0.5928446182250184, 'min_child_weight'... " @@ -370,14 +378,13 @@ "\n", "Optimizing for Recall. Greater score is better.\n", "\n", - "Searching up to 3 pipelines. No time limit is set. 
Set one using max_time parameter.\n", - "\n", - "Possible model types: linear_model, random_forest, xgboost\n", + "Searching up to 3 pipelines. \n", + "Possible model types: xgboost, linear_model, random_forest\n", "\n", - "✔ XGBoost w/ imputation: 0%| | Elapsed:00:00\n", - "✔ XGBoost w/ imputation: 33%|███▎ | Elapsed:00:00\n", - "✔ Random Forest w/ imputation: 67%|██████▋ | Elapsed:00:06\n", - "✔ Random Forest w/ imputation: 100%|██████████| Elapsed:00:06\n", + "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00\n", + "✔ XGBoost Classifier w/ One Hot Encod... 33%|███▎ | Elapsed:00:00\n", + "✔ Random Forest Classifier w/ One Hot... 67%|██████▋ | Elapsed:00:06\n", + "✔ Random Forest Classifier w/ One Hot... 100%|██████████| Elapsed:00:06\n", "\n", "✔ Optimization finished\n" ] diff --git a/docs/source/automl/regression_example.ipynb b/docs/source/automl/regression_example.ipynb index 22246296c6..98019ca5e3 100644 --- a/docs/source/automl/regression_example.ipynb +++ b/docs/source/automl/regression_example.ipynb @@ -22,16 +22,15 @@ "\n", "Optimizing for R2. Greater score is better.\n", "\n", - "Searching up to 5 pipelines. No time limit is set. Set one using max_time parameter.\n", + "Searching up to 5 pipelines. \n", + "Possible model types: linear_model, random_forest\n", "\n", - "Possible model types: random_forest\n", - "\n", - "✔ Random Forest w/ imputation: 0%| | Elapsed:00:05\n", - "✔ Random Forest w/ imputation: 20%|██ | Elapsed:00:10\n", - "✔ Random Forest w/ imputation: 40%|████ | Elapsed:00:16\n", - "✔ Random Forest w/ imputation: 60%|██████ | Elapsed:00:22\n", - "✔ Random Forest w/ imputation: 80%|████████ | Elapsed:00:30\n", - "✔ Random Forest w/ imputation: 100%|██████████| Elapsed:00:30\n", + "✔ Random Forest Regressor w/ One Hot ... 0%| | Elapsed:00:06\n", + "✔ Random Forest Regressor w/ One Hot ... 20%|██ | Elapsed:00:10\n", + "✔ Linear Regressor w/ One Hot Encoder... 40%|████ | Elapsed:00:10\n", + "✔ Random Forest Regressor w/ One Hot ... 40%|████ | Elapsed:00:16\n", + "✔ Random Forest Regressor w/ One Hot ... 80%|████████ | Elapsed:00:30\n", + "✔ Random Forest Regressor w/ One Hot ... 
100%|██████████| Elapsed:00:30\n", "\n", "✔ Optimization finished\n" ] @@ -86,23 +85,23 @@ " \n", " \n", " 0\n", - " 0\n", - " RFRegressionPipeline\n", - " 0.422322\n", + " 2\n", + " LinearRegressionPipeline\n", + " 0.488703\n", " False\n", - " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", + " {'impute_strategy': 'mean', 'normalize': True,...\n", " \n", " \n", " 1\n", - " 4\n", + " 0\n", " RFRegressionPipeline\n", - " 0.417416\n", + " 0.422322\n", " False\n", - " {'n_estimators': 859, 'max_depth': 6, 'impute_...\n", + " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", " \n", " \n", " 2\n", - " 3\n", + " 4\n", " RFRegressionPipeline\n", " 0.391463\n", " False\n", @@ -110,7 +109,7 @@ " \n", " \n", " 3\n", - " 2\n", + " 3\n", " RFRegressionPipeline\n", " 0.383134\n", " False\n", @@ -129,16 +128,16 @@ "" ], "text/plain": [ - " id pipeline_name score high_variance_cv \\\n", - "0 0 RFRegressionPipeline 0.422322 False \n", - "1 4 RFRegressionPipeline 0.417416 False \n", - "2 3 RFRegressionPipeline 0.391463 False \n", - "3 2 RFRegressionPipeline 0.383134 False \n", - "4 1 RFRegressionPipeline 0.381204 False \n", + " id pipeline_name score high_variance_cv \\\n", + "0 2 LinearRegressionPipeline 0.488703 False \n", + "1 0 RFRegressionPipeline 0.422322 False \n", + "2 4 RFRegressionPipeline 0.391463 False \n", + "3 3 RFRegressionPipeline 0.383134 False \n", + "4 1 RFRegressionPipeline 0.381204 False \n", "\n", " parameters \n", - "0 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", - "1 {'n_estimators': 859, 'max_depth': 6, 'impute_... \n", + "0 {'impute_strategy': 'mean', 'normalize': True,... \n", + "1 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", "2 {'n_estimators': 715, 'max_depth': 7, 'impute_... \n", "3 {'n_estimators': 609, 'max_depth': 7, 'impute_... \n", "4 {'n_estimators': 369, 'max_depth': 10, 'impute... " @@ -161,7 +160,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 3, @@ -181,7 +180,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -202,31 +201,41 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1m************************\u001b[0m\n", - "\u001b[1m* Pipeline Description *\u001b[0m\n", - "\u001b[1m************************\u001b[0m\n", + "\u001b[1m************************************************************************************************\u001b[0m\n", + "\u001b[1m* Random Forest Regressor w/ One Hot Encoder + Simple Imputer + RF Regressor Select From Model *\u001b[0m\n", + "\u001b[1m************************************************************************************************\u001b[0m\n", + "\n", + "Problem Types: Regression\n", + "Model Type: Random Forest\n", + "Objective to Optimize: R2 (greater is better)\n", + "\n", "\n", - "Pipeline Name: Random Forest w/ imputation\n", - "Model type: ModelTypes.RANDOM_FOREST\n", - "Objective: R2 (greater is better)\n", - "Total training time (including CV): 5.8 seconds\n", + "Pipeline Steps\n", + "===============\n", + "1. One Hot Encoder\n", + "2. Simple Imputer\n", + "\t * impute_strategy : most_frequent\n", + "3. RF Regressor Select From Model\n", + "\t * percent_features : 0.8593661614465293\n", + "\t * threshold : -inf\n", + "4. 
Random Forest Regressor\n", + "\t * n_estimators : 569\n", + "\t * max_depth : 22\n", "\n", - "Parameters\n", - "==========\n", - "• n_estimators: 569\n", - "• max_depth: 22\n", - "• impute_strategy: most_frequent\n", - "• percent_features: 0.8593661614465293\n", + "Training\n", + "========\n", + "Training for Regression problems.\n", + "Total training time (including CV): 6.0 seconds\n", "\n", "Cross Validation\n", - "=================\n", - " R2 # Training # Testing\n", - "0 0.427 294.000 148.000\n", - "1 0.450 295.000 147.000\n", - "2 0.390 295.000 147.000\n", - "mean 0.422 - -\n", - "std 0.031 - -\n", - "coef of var 0.072 - -\n" + "----------------\n", + " R2 MAE MSE MSLE MedianAE MaxError ExpVariance # Training # Testing\n", + "0 0.427 46.033 3276.018 0.194 39.699 161.858 0.428 294.000 148.000\n", + "1 0.450 48.953 3487.566 0.193 44.344 160.513 0.451 295.000 147.000\n", + "2 0.390 47.401 3477.117 0.193 41.297 171.420 0.390 295.000 147.000\n", + "mean 0.422 47.462 3413.567 0.193 41.780 164.597 0.423 - -\n", + "std 0.031 1.461 119.235 0.000 2.360 5.947 0.031 - -\n", + "coef of var 0.072 0.031 0.035 0.002 0.056 0.036 0.073 - -\n" ] } ], diff --git a/docs/source/automl/search_results.ipynb b/docs/source/automl/search_results.ipynb index 3f4ea5654f..71a2d572b9 100644 --- a/docs/source/automl/search_results.ipynb +++ b/docs/source/automl/search_results.ipynb @@ -24,21 +24,20 @@ "\n", "Optimizing for F1. Greater score is better.\n", "\n", - "Searching up to 10 pipelines. No time limit is set. Set one using max_time parameter.\n", + "Searching up to 10 pipelines. \n", + "Possible model types: linear_model, xgboost, random_forest\n", "\n", - "Possible model types: xgboost, linear_model, random_forest\n", - "\n", - "✔ XGBoost w/ imputation: 0%| | Elapsed:00:00\n", - "✔ XGBoost w/ imputation: 10%|█ | Elapsed:00:00\n", - "✔ Random Forest w/ imputation: 20%|██ | Elapsed:00:06\n", - "✔ XGBoost w/ imputation: 30%|███ | Elapsed:00:06\n", - "✔ LogisticRegression w/ imputation + ... 40%|████ | Elapsed:00:10\n", - "✔ XGBoost w/ imputation: 50%|█████ | Elapsed:00:10\n", - "✔ LogisticRegression w/ imputation + ... 60%|██████ | Elapsed:00:13\n", - "✔ XGBoost w/ imputation: 70%|███████ | Elapsed:00:13\n", - "✔ LogisticRegression w/ imputation + ... 80%|████████ | Elapsed:00:17\n", - "✔ LogisticRegression w/ imputation + ... 90%|█████████ | Elapsed:00:20\n", - "✔ LogisticRegression w/ imputation + ... 100%|██████████| Elapsed:00:20\n", + "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00\n", + "✔ XGBoost Classifier w/ One Hot Encod... 10%|█ | Elapsed:00:00\n", + "✔ Random Forest Classifier w/ One Hot... 20%|██ | Elapsed:00:06\n", + "✔ XGBoost Classifier w/ One Hot Encod... 30%|███ | Elapsed:00:06\n", + "✔ Logistic Regression Classifier w/ O... 40%|████ | Elapsed:00:14\n", + "✔ XGBoost Classifier w/ One Hot Encod... 50%|█████ | Elapsed:00:14\n", + "✔ Logistic Regression Classifier w/ O... 60%|██████ | Elapsed:00:22\n", + "✔ XGBoost Classifier w/ One Hot Encod... 70%|███████ | Elapsed:00:22\n", + "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:00:32\n", + "✔ Logistic Regression Classifier w/ O... 90%|█████████ | Elapsed:00:39\n", + "✔ Logistic Regression Classifier w/ O... 
100%|██████████| Elapsed:00:39\n", "\n", "✔ Optimization finished\n" ] @@ -133,20 +132,12 @@ " 4\n", " 1\n", " XGBoostPipeline\n", - " 0.970674\n", + " 0.970626\n", " False\n", " {'eta': 0.38438170729269994, 'min_child_weight...\n", " \n", " \n", " 5\n", - " 5\n", - " XGBoostPipeline\n", - " 0.969254\n", - " False\n", - " {'eta': 0.6481718720511973, 'min_child_weight'...\n", - " \n", - " \n", - " 6\n", " 2\n", " RFClassificationPipeline\n", " 0.966846\n", @@ -154,10 +145,18 @@ " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", " \n", " \n", + " 6\n", + " 5\n", + " XGBoostPipeline\n", + " 0.966592\n", + " False\n", + " {'eta': 0.6481718720511973, 'min_child_weight'...\n", + " \n", + " \n", " 7\n", " 0\n", " XGBoostPipeline\n", - " 0.965195\n", + " 0.965192\n", " False\n", " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", @@ -165,7 +164,7 @@ " 8\n", " 7\n", " XGBoostPipeline\n", - " 0.965195\n", + " 0.963913\n", " False\n", " {'eta': 0.9786183422327642, 'min_child_weight'...\n", " \n", @@ -173,7 +172,7 @@ " 9\n", " 3\n", " XGBoostPipeline\n", - " 0.960739\n", + " 0.952237\n", " False\n", " {'eta': 0.5288949197529046, 'min_child_weight'...\n", " \n", @@ -187,12 +186,12 @@ "1 6 LogisticRegressionPipeline 0.974853 False \n", "2 9 LogisticRegressionPipeline 0.974853 False \n", "3 4 LogisticRegressionPipeline 0.973411 False \n", - "4 1 XGBoostPipeline 0.970674 False \n", - "5 5 XGBoostPipeline 0.969254 False \n", - "6 2 RFClassificationPipeline 0.966846 False \n", - "7 0 XGBoostPipeline 0.965195 False \n", - "8 7 XGBoostPipeline 0.965195 False \n", - "9 3 XGBoostPipeline 0.960739 False \n", + "4 1 XGBoostPipeline 0.970626 False \n", + "5 2 RFClassificationPipeline 0.966846 False \n", + "6 5 XGBoostPipeline 0.966592 False \n", + "7 0 XGBoostPipeline 0.965192 False \n", + "8 7 XGBoostPipeline 0.963913 False \n", + "9 3 XGBoostPipeline 0.952237 False \n", "\n", " parameters \n", "0 {'penalty': 'l2', 'C': 0.5765626434012575, 'im... \n", @@ -200,8 +199,8 @@ "2 {'penalty': 'l2', 'C': 8.123565600467177, 'imp... \n", "3 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", "4 {'eta': 0.38438170729269994, 'min_child_weight... \n", - "5 {'eta': 0.6481718720511973, 'min_child_weight'... \n", - "6 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", + "5 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", + "6 {'eta': 0.6481718720511973, 'min_child_weight'... \n", "7 {'eta': 0.5928446182250184, 'min_child_weight'... \n", "8 {'eta': 0.9786183422327642, 'min_child_weight'... \n", "9 {'eta': 0.5288949197529046, 'min_child_weight'... 
" @@ -233,32 +232,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1m************************\u001b[0m\n", - "\u001b[1m* Pipeline Description *\u001b[0m\n", - "\u001b[1m************************\u001b[0m\n", + "\u001b[1m********************************************************************************************\u001b[0m\n", + "\u001b[1m* XGBoost Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model *\u001b[0m\n", + "\u001b[1m********************************************************************************************\u001b[0m\n", + "\n", + "Problem Types: Binary Classification, Multiclass Classification\n", + "Model Type: XGBoost Classifier\n", + "Objective to Optimize: F1 (greater is better)\n", + "\n", "\n", - "Pipeline Name: XGBoost w/ imputation\n", - "Model type: ModelTypes.XGBOOST\n", - "Objective: F1 (greater is better)\n", - "Total training time (including CV): 0.3 seconds\n", + "Pipeline Steps\n", + "===============\n", + "1. One Hot Encoder\n", + "2. Simple Imputer\n", + "\t * impute_strategy : most_frequent\n", + "3. RF Classifier Select From Model\n", + "\t * percent_features : 0.6273280598181127\n", + "\t * threshold : -inf\n", + "4. XGBoost Classifier\n", + "\t * eta : 0.5928446182250184\n", + "\t * max_depth : 4\n", + "\t * min_child_weight : 8.598391737229157\n", "\n", - "Parameters\n", - "==========\n", - "• eta: 0.5928446182250184\n", - "• min_child_weight: 8.598391737229157\n", - "• max_depth: 4\n", - "• impute_strategy: most_frequent\n", - "• percent_features: 0.6273280598181127\n", + "Training\n", + "========\n", + "Training for Binary Classification problems.\n", + "Total training time (including CV): 0.2 seconds\n", "\n", "Cross Validation\n", - "=================\n", + "----------------\n", " F1 Precision Recall AUC Log Loss MCC # Training # Testing\n", - "0 0.959 0.943 0.959 0.987 0.150 0.887 379.000 190.000\n", - "1 0.975 0.959 0.975 0.996 0.106 0.933 379.000 190.000\n", - "2 0.962 0.974 0.962 0.983 0.134 0.899 380.000 189.000\n", - "mean 0.965 0.959 0.965 0.988 0.130 0.906 - -\n", - "std 0.009 0.016 0.009 0.006 0.022 0.024 - -\n", - "coef of var 0.009 0.016 0.009 0.007 0.172 0.026 - -\n" + "0 0.950 0.935 0.950 0.985 0.154 0.864 379.000 190.000\n", + "1 0.975 0.959 0.975 0.996 0.102 0.933 379.000 190.000\n", + "2 0.970 0.991 0.970 0.983 0.137 0.923 380.000 189.000\n", + "mean 0.965 0.962 0.965 0.988 0.131 0.907 - -\n", + "std 0.013 0.028 0.013 0.007 0.026 0.037 - -\n", + "coef of var 0.014 0.029 0.014 0.007 0.202 0.041 - -\n" ] } ], @@ -282,7 +291,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -310,7 +319,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -365,91 +374,91 @@ " \n", " 0\n", " 22\n", - " 0.371201\n", + " 0.407441\n", " \n", " \n", " 1\n", - " 27\n", - " 0.153827\n", + " 7\n", + " 0.239457\n", " \n", " \n", " 2\n", - " 7\n", - " 0.145848\n", + " 27\n", + " 0.120609\n", " \n", " \n", " 3\n", " 20\n", - " 0.087861\n", + " 0.072031\n", " \n", " \n", " 4\n", " 23\n", - " 0.052571\n", + " 0.052818\n", " \n", " \n", " 5\n", - " 26\n", - " 0.044619\n", + " 6\n", + " 0.038344\n", " \n", " \n", " 6\n", - " 6\n", - " 0.036699\n", + " 1\n", + " 0.033962\n", " \n", " \n", " 7\n", - " 13\n", - " 0.032339\n", + " 21\n", + " 0.028949\n", " \n", " \n", " 8\n", - " 1\n", - " 0.026583\n", + " 4\n", + " 0.003987\n", " \n", " \n", " 9\n", - " 24\n", - " 0.026560\n", + " 25\n", + " 0.002403\n", " \n", " \n", " 10\n", - " 21\n", - " 0.021891\n", + " 0\n", + " 0.000000\n", " \n", " 
\n", " 11\n", - " 0\n", + " 2\n", " 0.000000\n", " \n", " \n", " 12\n", - " 2\n", + " 3\n", " 0.000000\n", " \n", " \n", " 13\n", - " 3\n", + " 12\n", " 0.000000\n", " \n", " \n", " 14\n", - " 4\n", + " 13\n", " 0.000000\n", " \n", " \n", " 15\n", - " 5\n", + " 18\n", " 0.000000\n", " \n", " \n", " 16\n", - " 8\n", + " 19\n", " 0.000000\n", " \n", " \n", " 17\n", - " 9\n", + " 29\n", " 0.000000\n", " \n", " \n", @@ -458,24 +467,24 @@ ], "text/plain": [ " feature importance\n", - "0 22 0.371201\n", - "1 27 0.153827\n", - "2 7 0.145848\n", - "3 20 0.087861\n", - "4 23 0.052571\n", - "5 26 0.044619\n", - "6 6 0.036699\n", - "7 13 0.032339\n", - "8 1 0.026583\n", - "9 24 0.026560\n", - "10 21 0.021891\n", - "11 0 0.000000\n", - "12 2 0.000000\n", - "13 3 0.000000\n", - "14 4 0.000000\n", - "15 5 0.000000\n", - "16 8 0.000000\n", - "17 9 0.000000" + "0 22 0.407441\n", + "1 7 0.239457\n", + "2 27 0.120609\n", + "3 20 0.072031\n", + "4 23 0.052818\n", + "5 6 0.038344\n", + "6 1 0.033962\n", + "7 21 0.028949\n", + "8 4 0.003987\n", + "9 25 0.002403\n", + "10 0 0.000000\n", + "11 2 0.000000\n", + "12 3 0.000000\n", + "13 12 0.000000\n", + "14 13 0.000000\n", + "15 18 0.000000\n", + "16 19 0.000000\n", + "17 29 0.000000" ] }, "execution_count": 6, @@ -511,34 +520,34 @@ " 'max_depth': 4,\n", " 'impute_strategy': 'most_frequent',\n", " 'percent_features': 0.6273280598181127},\n", - " 'score': 0.9651954750600785,\n", + " 'score': 0.9651923054186028,\n", " 'high_variance_cv': False,\n", - " 'scores': [0.9586776859504134, 0.9752066115702479, 0.9617021276595743],\n", - " 'all_objective_scores': [OrderedDict([('F1', 0.9586776859504134),\n", - " ('Precision', 0.943089430894309),\n", - " ('Recall', 0.9586776859504134),\n", - " ('AUC', 0.9865664575689431),\n", - " ('Log Loss', 0.14983175628279385),\n", - " ('MCC', 0.8871869342405617),\n", + " 'scores': [0.9504132231404958, 0.9752066115702479, 0.9699570815450643],\n", + " 'all_objective_scores': [OrderedDict([('F1', 0.9504132231404958),\n", + " ('Precision', 0.9349593495934959),\n", + " ('Recall', 0.9504132231404958),\n", + " ('AUC', 0.984731920937389),\n", + " ('Log Loss', 0.1536501646237938),\n", + " ('MCC', 0.8644170412909863),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", " OrderedDict([('F1', 0.9752066115702479),\n", " ('Precision', 0.959349593495935),\n", " ('Recall', 0.9752066115702479),\n", - " ('AUC', 0.9955616049236596),\n", - " ('Log Loss', 0.10579590414111552),\n", + " ('AUC', 0.9960350337318026),\n", + " ('Log Loss', 0.10194972519713798),\n", " ('MCC', 0.9327267201397125),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", - " OrderedDict([('F1', 0.9617021276595743),\n", - " ('Precision', 0.9741379310344828),\n", - " ('Recall', 0.9617021276595743),\n", - " ('AUC', 0.9829531812725091),\n", - " ('Log Loss', 0.13361063377843016),\n", - " ('MCC', 0.8993040708411105),\n", + " OrderedDict([('F1', 0.9699570815450643),\n", + " ('Precision', 0.9912280701754386),\n", + " ('Recall', 0.9699570815450643),\n", + " ('AUC', 0.983313325330132),\n", + " ('Log Loss', 0.13664108953345075),\n", + " ('MCC', 0.9231826763268304),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 0.2605268955230713},\n", + " 'training_time': 0.2018277645111084},\n", " 1: {'id': 1,\n", " 'pipeline_name': 'XGBoostPipeline',\n", " 'parameters': {'eta': 0.38438170729269994,\n", @@ -546,34 +555,34 @@ " 'max_depth': 13,\n", " 'impute_strategy': 'median',\n", " 'percent_features': 0.793807787701838},\n", - " 'score': 0.9706738245383719,\n", + " 
'score': 0.9706261399583499,\n", " 'high_variance_cv': False,\n", - " 'scores': [0.9666666666666667, 0.979253112033195, 0.9661016949152542],\n", - " 'all_objective_scores': [OrderedDict([('F1', 0.9666666666666667),\n", - " ('Precision', 0.9586776859504132),\n", - " ('Recall', 0.9666666666666667),\n", - " ('AUC', 0.9918333530595337),\n", - " ('Log Loss', 0.11837350234119712),\n", - " ('MCC', 0.9097672817424011),\n", + " 'scores': [0.9707112970711297, 0.9709543568464729, 0.9702127659574468],\n", + " 'all_objective_scores': [OrderedDict([('F1', 0.9707112970711297),\n", + " ('Precision', 0.9666666666666667),\n", + " ('Recall', 0.9707112970711297),\n", + " ('AUC', 0.9917149958574978),\n", + " ('Log Loss', 0.11573912222489813),\n", + " ('MCC', 0.9211268105467613),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", - " OrderedDict([('F1', 0.979253112033195),\n", - " ('Precision', 0.9672131147540983),\n", - " ('Recall', 0.979253112033195),\n", - " ('AUC', 0.9963309267368918),\n", - " ('Log Loss', 0.08243243813477946),\n", - " ('MCC', 0.943843520216036),\n", + " OrderedDict([('F1', 0.9709543568464729),\n", + " ('Precision', 0.9590163934426229),\n", + " ('Recall', 0.9709543568464729),\n", + " ('AUC', 0.9969227127470707),\n", + " ('Log Loss', 0.07704140599817037),\n", + " ('MCC', 0.9211492315750531),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", - " OrderedDict([('F1', 0.9661016949152542),\n", - " ('Precision', 0.9743589743589743),\n", - " ('Recall', 0.9661016949152542),\n", - " ('AUC', 0.9858343337334934),\n", - " ('Log Loss', 0.1252105505447145),\n", - " ('MCC', 0.9100059668642326),\n", + " OrderedDict([('F1', 0.9702127659574468),\n", + " ('Precision', 0.9827586206896551),\n", + " ('Recall', 0.9702127659574468),\n", + " ('AUC', 0.9857142857142858),\n", + " ('Log Loss', 0.12628072744331484),\n", + " ('MCC', 0.9218075091290715),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 0.33245086669921875},\n", + " 'training_time': 0.26802611351013184},\n", " 2: {'id': 2,\n", " 'pipeline_name': 'RFClassificationPipeline',\n", " 'parameters': {'n_estimators': 569,\n", @@ -607,7 +616,7 @@ " ('MCC', 0.9208800271662652),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 6.14126992225647},\n", + " 'training_time': 6.220675945281982},\n", " 3: {'id': 3,\n", " 'pipeline_name': 'XGBoostPipeline',\n", " 'parameters': {'eta': 0.5288949197529046,\n", @@ -615,34 +624,34 @@ " 'max_depth': 6,\n", " 'impute_strategy': 'most_frequent',\n", " 'percent_features': 0.34402219881309576},\n", - " 'score': 0.9607393479447351,\n", + " 'score': 0.9522372250281359,\n", " 'high_variance_cv': False,\n", - " 'scores': [0.9504132231404958, 0.9794238683127572, 0.9523809523809523],\n", - " 'all_objective_scores': [OrderedDict([('F1', 0.9504132231404958),\n", - " ('Precision', 0.9349593495934959),\n", - " ('Recall', 0.9504132231404958),\n", - " ('AUC', 0.9865664575689431),\n", - " ('Log Loss', 0.14135858728060205),\n", - " ('MCC', 0.8644170412909863),\n", + " 'scores': [0.9367088607594938, 0.9672131147540983, 0.9527896995708156],\n", + " 'all_objective_scores': [OrderedDict([('F1', 0.9367088607594938),\n", + " ('Precision', 0.940677966101695),\n", + " ('Recall', 0.9367088607594938),\n", + " ('AUC', 0.9821872410936205),\n", + " ('Log Loss', 0.16857726289155453),\n", + " ('MCC', 0.8318710075349047),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", - " OrderedDict([('F1', 0.9794238683127572),\n", - " ('Precision', 0.9596774193548387),\n", - " 
('Recall', 0.9794238683127572),\n", - " ('AUC', 0.9960350337318026),\n", - " ('Log Loss', 0.08963871603513038),\n", - " ('MCC', 0.9445075449666159),\n", + " OrderedDict([('F1', 0.9672131147540983),\n", + " ('Precision', 0.944),\n", + " ('Recall', 0.9672131147540983),\n", + " ('AUC', 0.9937270682921056),\n", + " ('Log Loss', 0.10433676971098114),\n", + " ('MCC', 0.9106361866954563),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", - " OrderedDict([('F1', 0.9523809523809523),\n", - " ('Precision', 0.9821428571428571),\n", - " ('Recall', 0.9523809523809523),\n", - " ('AUC', 0.985954381752701),\n", - " ('Log Loss', 0.13301814707838708),\n", - " ('MCC', 0.8803966271554114),\n", + " OrderedDict([('F1', 0.9527896995708156),\n", + " ('Precision', 0.9736842105263158),\n", + " ('Recall', 0.9527896995708156),\n", + " ('AUC', 0.9845138055222089),\n", + " ('Log Loss', 0.14270813120701523),\n", + " ('MCC', 0.8783921421654207),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 0.25251102447509766},\n", + " 'training_time': 0.17779779434204102},\n", " 4: {'id': 4,\n", " 'pipeline_name': 'LogisticRegressionPipeline',\n", " 'parameters': {'penalty': 'l2',\n", @@ -675,7 +684,7 @@ " ('MCC', 0.9336637889421326),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 3.094980001449585},\n", + " 'training_time': 7.575798034667969},\n", " 5: {'id': 5,\n", " 'pipeline_name': 'XGBoostPipeline',\n", " 'parameters': {'eta': 0.6481718720511973,\n", @@ -683,34 +692,34 @@ " 'max_depth': 6,\n", " 'impute_strategy': 'most_frequent',\n", " 'percent_features': 0.871312026764351},\n", - " 'score': 0.969254157920668,\n", + " 'score': 0.966592074666908,\n", " 'high_variance_cv': False,\n", - " 'scores': [0.9623430962343097, 0.9752066115702479, 0.9702127659574468],\n", - " 'all_objective_scores': [OrderedDict([('F1', 0.9623430962343097),\n", - " ('Precision', 0.9583333333333334),\n", - " ('Recall', 0.9623430962343097),\n", - " ('AUC', 0.9901763522310333),\n", - " ('Log Loss', 0.12383983944248604),\n", - " ('MCC', 0.8985734479173947),\n", + " 'scores': [0.9543568464730291, 0.9752066115702479, 0.9702127659574468],\n", + " 'all_objective_scores': [OrderedDict([('F1', 0.9543568464730291),\n", + " ('Precision', 0.9426229508196722),\n", + " ('Recall', 0.9543568464730291),\n", + " ('AUC', 0.9899396378269618),\n", + " ('Log Loss', 0.12702225128151967),\n", + " ('MCC', 0.8757606542930872),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", " OrderedDict([('F1', 0.9752066115702479),\n", " ('Precision', 0.959349593495935),\n", " ('Recall', 0.9752066115702479),\n", - " ('AUC', 0.9962125695348562),\n", - " ('Log Loss', 0.08514171470765416),\n", + " ('AUC', 0.9965676411409634),\n", + " ('Log Loss', 0.0801103590350402),\n", " ('MCC', 0.9327267201397125),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", " OrderedDict([('F1', 0.9702127659574468),\n", " ('Precision', 0.9827586206896551),\n", " ('Recall', 0.9702127659574468),\n", - " ('AUC', 0.9860744297719087),\n", - " ('Log Loss', 0.12377072123640645),\n", + " ('AUC', 0.9858343337334934),\n", + " ('Log Loss', 0.1270006743029361),\n", " ('MCC', 0.9218075091290715),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 0.35909104347229004},\n", + " 'training_time': 0.3415100574493408},\n", " 6: {'id': 6,\n", " 'pipeline_name': 'LogisticRegressionPipeline',\n", " 'parameters': {'penalty': 'l2',\n", @@ -743,7 +752,7 @@ " ('MCC', 0.9336637889421326),\n", " ('# Training', 380),\n", " ('# 
Testing', 189)])],\n", - " 'training_time': 3.22955584526062},\n", + " 'training_time': 7.9174652099609375},\n", " 7: {'id': 7,\n", " 'pipeline_name': 'XGBoostPipeline',\n", " 'parameters': {'eta': 0.9786183422327642,\n", @@ -751,34 +760,34 @@ " 'max_depth': 20,\n", " 'impute_strategy': 'median',\n", " 'percent_features': 0.6820907348177707},\n", - " 'score': 0.9651954750600785,\n", + " 'score': 0.9639126305792973,\n", " 'high_variance_cv': False,\n", - " 'scores': [0.9586776859504134, 0.9752066115702479, 0.9617021276595743],\n", - " 'all_objective_scores': [OrderedDict([('F1', 0.9586776859504134),\n", - " ('Precision', 0.943089430894309),\n", - " ('Recall', 0.9586776859504134),\n", - " ('AUC', 0.986566457568943),\n", - " ('Log Loss', 0.1476227625175134),\n", - " ('MCC', 0.8871869342405617),\n", + " 'scores': [0.9547325102880658, 0.9711934156378601, 0.9658119658119659],\n", + " 'all_objective_scores': [OrderedDict([('F1', 0.9547325102880658),\n", + " ('Precision', 0.9354838709677419),\n", + " ('Recall', 0.9547325102880658),\n", + " ('AUC', 0.9853237069475678),\n", + " ('Log Loss', 0.15021697619047605),\n", + " ('MCC', 0.8759603969361893),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", - " OrderedDict([('F1', 0.9752066115702479),\n", - " ('Precision', 0.959349593495935),\n", - " ('Recall', 0.9752066115702479),\n", - " ('AUC', 0.9962125695348562),\n", - " ('Log Loss', 0.09923503662116434),\n", - " ('MCC', 0.9327267201397125),\n", + " OrderedDict([('F1', 0.9711934156378601),\n", + " ('Precision', 0.9516129032258065),\n", + " ('Recall', 0.9711934156378601),\n", + " ('AUC', 0.9950289975144987),\n", + " ('Log Loss', 0.10607622409680564),\n", + " ('MCC', 0.9216584956231404),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", - " OrderedDict([('F1', 0.9617021276595743),\n", - " ('Precision', 0.9741379310344828),\n", - " ('Recall', 0.9617021276595743),\n", + " OrderedDict([('F1', 0.9658119658119659),\n", + " ('Precision', 0.9826086956521739),\n", + " ('Recall', 0.9658119658119659),\n", " ('AUC', 0.9834333733493397),\n", - " ('Log Loss', 0.13148702966620918),\n", - " ('MCC', 0.8993040708411105),\n", + " ('Log Loss', 0.13131227825704234),\n", + " ('MCC', 0.9112159507396058),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 0.2930469512939453},\n", + " 'training_time': 0.2536630630493164},\n", " 8: {'id': 8,\n", " 'pipeline_name': 'LogisticRegressionPipeline',\n", " 'parameters': {'penalty': 'l2',\n", @@ -811,7 +820,7 @@ " ('MCC', 0.9443109474170326),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 3.4265527725219727},\n", + " 'training_time': 9.234977006912231},\n", " 9: {'id': 9,\n", " 'pipeline_name': 'LogisticRegressionPipeline',\n", " 'parameters': {'penalty': 'l2',\n", @@ -844,7 +853,7 @@ " ('MCC', 0.9336637889421326),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 3.400575876235962}}" + " 'training_time': 7.72037410736084}}" ] }, "execution_count": 7, diff --git a/docs/source/demos/fraud.ipynb b/docs/source/demos/fraud.ipynb index 58c16a9148..814e65b6e7 100644 --- a/docs/source/demos/fraud.ipynb +++ b/docs/source/demos/fraud.ipynb @@ -146,16 +146,27 @@ "\n", "Optimizing for Fraud Cost. Lower score is better.\n", "\n", - "Searching up to 5 pipelines. No time limit is set. Set one using max_time parameter.\n", + "Searching up to 5 pipelines. 
\n", + "Possible model types: random_forest, xgboost, linear_model\n", "\n", - "Possible model types: xgboost, linear_model, random_forest\n", - "\n", - "✔ XGBoost w/ imputation: 0%| | Elapsed:01:44\n", - "✔ XGBoost w/ imputation: 20%|██ | Elapsed:05:09\n", - "✔ Random Forest w/ imputation: 40%|████ | Elapsed:05:54\n", - "✔ XGBoost w/ imputation: 60%|██████ | Elapsed:07:45\n", - "✔ LogisticRegression w/ imputation + ... 80%|████████ | Elapsed:08:05\n", - "✔ LogisticRegression w/ imputation + ... 100%|██████████| Elapsed:08:05\n", + "▹ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields store_id, amount, lat, lng\n", + "DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields store_id, amount, lat, lng\n", + "DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields card_id, store_id, amount, lat\n", + "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:12\n", + "✔ XGBoost Classifier w/ One Hot Encod... 20%|██ | Elapsed:00:39\n", + "✔ Random Forest Classifier w/ One Hot... 40%|████ | Elapsed:02:38\n", + "▹ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:02:38DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields amount, lng\n", + "DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields store_id, amount\n", + "DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields store_id, amount\n", + "✔ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:02:50\n", + "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:03:16\n", + "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:03:16\n", "\n", "✔ Optimization finished\n" ] @@ -230,27 +241,27 @@ " \n", " \n", " 2\n", - " 0\n", - " XGBoostPipeline\n", + " 2\n", + " RFClassificationPipeline\n", " 0.007623\n", " False\n", - " {'eta': 0.5928446182250184, 'min_child_weight'...\n", + " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", " \n", " \n", " 3\n", - " 3\n", + " 0\n", " XGBoostPipeline\n", - " 0.007623\n", + " NaN\n", " False\n", - " {'eta': 0.5288949197529046, 'min_child_weight'...\n", + " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", " \n", " 4\n", - " 2\n", - " RFClassificationPipeline\n", - " 0.007623\n", + " 3\n", + " XGBoostPipeline\n", + " NaN\n", " False\n", - " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", + " {'eta': 0.5288949197529046, 'min_child_weight'...\n", " \n", " \n", "\n", @@ -260,16 +271,16 @@ " id pipeline_name score high_variance_cv \\\n", "0 1 XGBoostPipeline 0.007623 False \n", "1 4 LogisticRegressionPipeline 0.007623 False \n", - "2 0 XGBoostPipeline 0.007623 False \n", - "3 3 XGBoostPipeline 0.007623 False \n", - "4 2 RFClassificationPipeline 0.007623 False \n", + "2 2 RFClassificationPipeline 0.007623 False \n", + "3 0 XGBoostPipeline NaN False \n", + "4 3 XGBoostPipeline NaN False \n", "\n", " parameters \n", "0 {'eta': 0.38438170729269994, 'min_child_weight... \n", "1 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", - "2 {'eta': 0.5928446182250184, 'min_child_weight'... \n", - "3 {'eta': 0.5288949197529046, 'min_child_weight'... \n", - "4 {'n_estimators': 569, 'max_depth': 22, 'impute... " + "2 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", + "3 {'eta': 0.5928446182250184, 'min_child_weight'... 
\n", + "4 {'eta': 0.5288949197529046, 'min_child_weight'... " ] }, "execution_count": 6, @@ -315,30 +326,40 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1m************************\u001b[0m\n", - "\u001b[1m* Pipeline Description *\u001b[0m\n", - "\u001b[1m************************\u001b[0m\n", + "\u001b[1m********************************************************************************************\u001b[0m\n", + "\u001b[1m* XGBoost Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model *\u001b[0m\n", + "\u001b[1m********************************************************************************************\u001b[0m\n", + "\n", + "Problem Types: Binary Classification, Multiclass Classification\n", + "Model Type: XGBoost Classifier\n", + "Objective to Optimize: Fraud Cost (lower is better)\n", "\n", - "Pipeline Name: XGBoost w/ imputation\n", - "Model type: ModelTypes.XGBOOST\n", - "Objective: Fraud Cost (lower is better)\n", - "Total training time (including CV): 205.8 seconds\n", "\n", - "Parameters\n", - "==========\n", - "• eta: 0.38438170729269994\n", - "• min_child_weight: 3.677811458900251\n", - "• max_depth: 13\n", - "• impute_strategy: median\n", - "• percent_features: 0.793807787701838\n", + "Pipeline Steps\n", + "===============\n", + "1. One Hot Encoder\n", + "2. Simple Imputer\n", + "\t * impute_strategy : median\n", + "3. RF Classifier Select From Model\n", + "\t * percent_features : 0.793807787701838\n", + "\t * threshold : -inf\n", + "4. XGBoost Classifier\n", + "\t * eta : 0.38438170729269994\n", + "\t * max_depth : 13\n", + "\t * min_child_weight : 3.677811458900251\n", + "\n", + "Training\n", + "========\n", + "Training for Binary Classification problems.\n", + "Total training time (including CV): 26.4 seconds\n", "\n", "Cross Validation\n", - "=================\n", + "----------------\n", " Fraud Cost AUC Recall Precision # Training # Testing\n", - "0 0.008 0.831 0.264 0.152 53328.000 26665.000\n", - "1 0.008 0.833 0.264 0.152 53328.000 26665.000\n", - "2 0.008 0.836 0.264 0.152 53330.000 26663.000\n", - "mean 0.008 0.834 0.264 0.152 - -\n", + "0 0.008 0.864 0.264 0.152 53328.000 26665.000\n", + "1 0.008 0.862 0.264 0.152 53328.000 26665.000\n", + "2 0.008 0.867 0.264 0.152 53330.000 26663.000\n", + "mean 0.008 0.864 0.264 0.152 - -\n", "std 0.000 0.003 0.000 0.000 - -\n", "coef of var 0.003 0.003 0.000 0.000 - -\n" ] @@ -365,7 +386,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 9, @@ -392,9 +413,9 @@ { "data": { "text/plain": [ - "(0.007626457064581641,\n", - " OrderedDict([('AUC', 0.8336438887334185),\n", - " ('Fraud Cost', 0.007626457064581641)]))" + "(0.007626457502689945,\n", + " OrderedDict([('AUC', 0.8691817003558158),\n", + " ('Fraud Cost', 0.007626457502689945)]))" ] }, "execution_count": 10, @@ -430,16 +451,27 @@ "\n", "Optimizing for AUC. Greater score is better.\n", "\n", - "Searching up to 5 pipelines. No time limit is set. Set one using max_time parameter.\n", - "\n", - "Possible model types: xgboost, linear_model, random_forest\n", + "Searching up to 5 pipelines. \n", + "Possible model types: random_forest, xgboost, linear_model\n", "\n", - "✔ XGBoost w/ imputation: 0%| | Elapsed:01:49\n", - "✔ XGBoost w/ imputation: 20%|██ | Elapsed:06:13\n", - "✔ Random Forest w/ imputation: 40%|████ | Elapsed:07:15\n", - "✔ XGBoost w/ imputation: 60%|██████ | Elapsed:09:38\n", - "✔ LogisticRegression w/ imputation + ... 80%|████████ | Elapsed:10:00\n", - "✔ LogisticRegression w/ imputation + ... 
100%|██████████| Elapsed:10:00\n", + "▹ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields card_id, store_id, amount, lat\n", + "DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields card_id, store_id, amount, lat\n", + "DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields card_id, store_id, amount, lng\n", + "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:14\n", + "✔ XGBoost Classifier w/ One Hot Encod... 20%|██ | Elapsed:00:41\n", + "✔ Random Forest Classifier w/ One Hot... 40%|████ | Elapsed:02:59\n", + "▹ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:02:59DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields store_id, amount\n", + "DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields store_id, amount\n", + "DataFrame.dtypes for data must be int, float or bool.\n", + " Did not expect the data types in fields amount, lat\n", + "✔ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:03:14\n", + "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:03:40\n", + "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:03:40\n", "\n", "✔ Optimization finished\n" ] @@ -504,35 +536,35 @@ " \n", " \n", " 1\n", - " 0\n", + " 1\n", " XGBoostPipeline\n", - " 0.849826\n", + " 0.867186\n", " False\n", - " {'eta': 0.5928446182250184, 'min_child_weight'...\n", + " {'eta': 0.38438170729269994, 'min_child_weight...\n", " \n", " \n", " 2\n", - " 1\n", - " XGBoostPipeline\n", - " 0.840634\n", + " 4\n", + " LogisticRegressionPipeline\n", + " 0.831181\n", " False\n", - " {'eta': 0.38438170729269994, 'min_child_weight...\n", + " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", " \n", " \n", " 3\n", - " 3\n", + " 0\n", " XGBoostPipeline\n", - " 0.839091\n", + " NaN\n", " False\n", - " {'eta': 0.5288949197529046, 'min_child_weight'...\n", + " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", " \n", " 4\n", - " 4\n", - " LogisticRegressionPipeline\n", - " 0.831181\n", + " 3\n", + " XGBoostPipeline\n", + " NaN\n", " False\n", - " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", + " {'eta': 0.5288949197529046, 'min_child_weight'...\n", " \n", " \n", "\n", @@ -541,17 +573,17 @@ "text/plain": [ " id pipeline_name score high_variance_cv \\\n", "0 2 RFClassificationPipeline 0.873053 False \n", - "1 0 XGBoostPipeline 0.849826 False \n", - "2 1 XGBoostPipeline 0.840634 False \n", - "3 3 XGBoostPipeline 0.839091 False \n", - "4 4 LogisticRegressionPipeline 0.831181 False \n", + "1 1 XGBoostPipeline 0.867186 False \n", + "2 4 LogisticRegressionPipeline 0.831181 False \n", + "3 0 XGBoostPipeline NaN False \n", + "4 3 XGBoostPipeline NaN False \n", "\n", " parameters \n", "0 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", - "1 {'eta': 0.5928446182250184, 'min_child_weight'... \n", - "2 {'eta': 0.38438170729269994, 'min_child_weight... \n", - "3 {'eta': 0.5288949197529046, 'min_child_weight'... \n", - "4 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... " + "1 {'eta': 0.38438170729269994, 'min_child_weight... \n", + "2 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", + "3 {'eta': 0.5928446182250184, 'min_child_weight'... \n", + "4 {'eta': 0.5288949197529046, 'min_child_weight'... 
" ] }, "execution_count": 12, @@ -571,7 +603,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 13, diff --git a/docs/source/index.ipynb b/docs/source/index.ipynb index b59f225e1d..c947e9ddba 100644 --- a/docs/source/index.ipynb +++ b/docs/source/index.ipynb @@ -110,16 +110,15 @@ "\n", "Optimizing for F1. Greater score is better.\n", "\n", - "Searching up to 5 pipelines. No time limit is set. Set one using max_time parameter.\n", + "Searching up to 5 pipelines. \n", + "Possible model types: xgboost, linear_model, random_forest\n", "\n", - "Possible model types: linear_model, random_forest, xgboost\n", - "\n", - "✔ XGBoost w/ imputation: 0%| | Elapsed:00:00\n", - "✔ XGBoost w/ imputation: 20%|██ | Elapsed:00:00\n", - "✔ Random Forest w/ imputation: 40%|████ | Elapsed:00:07\n", - "✔ XGBoost w/ imputation: 60%|██████ | Elapsed:00:07\n", - "✔ LogisticRegression w/ imputation + ... 80%|████████ | Elapsed:00:11\n", - "✔ LogisticRegression w/ imputation + ... 100%|██████████| Elapsed:00:11\n", + "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00\n", + "✔ XGBoost Classifier w/ One Hot Encod... 20%|██ | Elapsed:00:01\n", + "✔ Random Forest Classifier w/ One Hot... 40%|████ | Elapsed:00:12\n", + "✔ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:00:12\n", + "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:00:22\n", + "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:00:22\n", "\n", "✔ Optimization finished\n" ] @@ -192,7 +191,7 @@ " 2\n", " 1\n", " XGBoostPipeline\n", - " 0.966802\n", + " 0.970312\n", " False\n", " {'eta': 0.38438170729269994, 'min_child_weight...\n", " \n", @@ -200,7 +199,7 @@ " 3\n", " 0\n", " XGBoostPipeline\n", - " 0.963458\n", + " 0.959800\n", " False\n", " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", @@ -208,7 +207,7 @@ " 4\n", " 3\n", " XGBoostPipeline\n", - " 0.963440\n", + " 0.957570\n", " False\n", " {'eta': 0.5288949197529046, 'min_child_weight'...\n", " \n", @@ -220,9 +219,9 @@ " id pipeline_name score high_variance_cv \\\n", "0 2 RFClassificationPipeline 0.973822 False \n", "1 4 LogisticRegressionPipeline 0.971963 False \n", - "2 1 XGBoostPipeline 0.966802 False \n", - "3 0 XGBoostPipeline 0.963458 False \n", - "4 3 XGBoostPipeline 0.963440 False \n", + "2 1 XGBoostPipeline 0.970312 False \n", + "3 0 XGBoostPipeline 0.959800 False \n", + "4 3 XGBoostPipeline 0.957570 False \n", "\n", " parameters \n", "0 {'n_estimators': 569, 'max_depth': 22, 'impute... 
\n", @@ -259,32 +258,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1m************************\u001b[0m\n", - "\u001b[1m* Pipeline Description *\u001b[0m\n", - "\u001b[1m************************\u001b[0m\n", + "\u001b[1m********************************************************************************************\u001b[0m\n", + "\u001b[1m* XGBoost Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model *\u001b[0m\n", + "\u001b[1m********************************************************************************************\u001b[0m\n", + "\n", + "Problem Types: Binary Classification, Multiclass Classification\n", + "Model Type: XGBoost Classifier\n", + "Objective to Optimize: F1 (greater is better)\n", "\n", - "Pipeline Name: XGBoost w/ imputation\n", - "Model type: ModelTypes.XGBOOST\n", - "Objective: F1 (greater is better)\n", - "Total training time (including CV): 0.2 seconds\n", "\n", - "Parameters\n", - "==========\n", - "• eta: 0.5288949197529046\n", - "• min_child_weight: 6.112401049845392\n", - "• max_depth: 6\n", - "• impute_strategy: most_frequent\n", - "• percent_features: 0.34402219881309576\n", + "Pipeline Steps\n", + "===============\n", + "1. One Hot Encoder\n", + "2. Simple Imputer\n", + "\t * impute_strategy : most_frequent\n", + "3. RF Classifier Select From Model\n", + "\t * percent_features : 0.34402219881309576\n", + "\t * threshold : -inf\n", + "4. XGBoost Classifier\n", + "\t * eta : 0.5288949197529046\n", + "\t * max_depth : 6\n", + "\t * min_child_weight : 6.112401049845392\n", + "\n", + "Training\n", + "========\n", + "Training for Binary Classification problems.\n", + "Total training time (including CV): 0.2 seconds\n", "\n", "Cross Validation\n", - "=================\n", + "----------------\n", " F1 Precision Recall AUC Log Loss MCC # Training # Testing\n", - "0 0.974 0.959 0.974 0.995 0.104 0.930 303.000 152.000\n", - "1 0.958 0.948 0.958 0.984 0.129 0.887 303.000 152.000\n", - "2 0.958 0.958 0.958 0.990 0.128 0.886 304.000 151.000\n", - "mean 0.963 0.955 0.963 0.990 0.120 0.901 - -\n", - "std 0.009 0.006 0.009 0.005 0.014 0.025 - -\n", - "coef of var 0.010 0.006 0.010 0.006 0.117 0.028 - -\n" + "0 0.974 0.959 0.974 0.995 0.100 0.930 303.000 152.000\n", + "1 0.946 0.967 0.946 0.985 0.147 0.863 303.000 152.000\n", + "2 0.952 0.957 0.952 0.987 0.155 0.873 304.000 151.000\n", + "mean 0.958 0.961 0.958 0.989 0.134 0.889 - -\n", + "std 0.015 0.005 0.015 0.006 0.030 0.036 - -\n", + "coef of var 0.015 0.005 0.015 0.006 0.222 0.041 - -\n" ] } ], @@ -406,7 +415,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.5" + "version": "3.7.4" } }, "nbformat": 4, From 6e83c4fd82a1ac109613cfe5bd092dc363d3ae46 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Mon, 11 Nov 2019 10:22:17 -0500 Subject: [PATCH 09/17] removing newline --- evalml/pipelines/pipeline_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 8973fb3a2e..8fbf83004f 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -96,7 +96,7 @@ def describe(self, return_dict=False): self.logger.log(objective_string) # Summary of steps - self.logger.log_subtitle("\nPipeline Steps") + self.logger.log_subtitle("Pipeline Steps") for number, component in enumerate(self.component_list, 1): component_string = str(number) + ". 
" + component.name self.logger.log(component_string) From 7752fa3b531552eef19af595cf97eda7d6b6ab4f Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Mon, 11 Nov 2019 15:47:37 -0500 Subject: [PATCH 10/17] updating to fix warning --- docs/Fraud Prediction Demo.ipynb | 432 ++++++++++++------ .../feature_selection/feature_selector.py | 8 +- .../components/transformers/transformer.py | 4 +- 3 files changed, 291 insertions(+), 153 deletions(-) diff --git a/docs/Fraud Prediction Demo.ipynb b/docs/Fraud Prediction Demo.ipynb index 2d507986e6..4ea956f78d 100644 --- a/docs/Fraud Prediction Demo.ipynb +++ b/docs/Fraud Prediction Demo.ipynb @@ -124,11 +124,183 @@ "\n", "Optimizing for Fraud Cost. Lower score is better.\n", "\n", - "Searching up to 20 pipelines. Will stop searching for new pipelines after 120 seconds.\n", + "Searching up to 20 pipelines. \n", + "Will stop searching for new pipelines after 120 seconds.\n", "\n", - "Possible model types: linear_model, xgboost, random_forest\n", + "Possible model types: linear_model, random_forest, xgboost\n", + "\n", + "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:01\n", + "▹ XGBoost Classifier w/ One Hot Encod... 5%|▌ | Elapsed:00:01" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n", + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✔ XGBoost Classifier w/ One Hot Encod... 5%|▌ | Elapsed:00:05\n", + "▹ Random Forest Classifier w/ One Hot... 10%|█ | Elapsed:00:05" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✔ Random Forest Classifier w/ One Hot... 10%|█ | Elapsed:00:38\n", + "▹ XGBoost Classifier w/ One Hot Encod... 15%|█▌ | Elapsed:00:38" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✔ XGBoost Classifier w/ One Hot Encod... 15%|█▌ | Elapsed:00:41\n", + "▹ Logistic Regression Classifier w/ O... 20%|██ | Elapsed:00:41" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✔ Logistic Regression Classifier w/ O... 20%|██ | Elapsed:00:49\n", + "▹ XGBoost Classifier w/ One Hot Encod... 
25%|██▌ | Elapsed:00:49" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✔ XGBoost Classifier w/ One Hot Encod... 25%|██▌ | Elapsed:00:52\n", + "▹ Logistic Regression Classifier w/ O... 30%|███ | Elapsed:00:52" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✔ Logistic Regression Classifier w/ O... 30%|███ | Elapsed:01:00\n", + "✔ XGBoost Classifier w/ One Hot Encod... 35%|███▌ | Elapsed:01:06\n", + "▹ Logistic Regression Classifier w/ O... 40%|████ | Elapsed:01:06" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✔ Logistic Regression Classifier w/ O... 40%|████ | Elapsed:01:14\n", + "✔ Logistic Regression Classifier w/ O... 45%|████▌ | Elapsed:01:23\n", + "✔ XGBoost Classifier w/ One Hot Encod... 50%|█████ | Elapsed:01:24\n", + "✔ Logistic Regression Classifier w/ O... 55%|█████▌ | Elapsed:01:32\n", + "▹ Random Forest Classifier w/ One Hot... 60%|██████ | Elapsed:01:32" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✔ Random Forest Classifier w/ One Hot... 60%|██████ | Elapsed:01:47\n", + "▹ Logistic Regression Classifier w/ O... 65%|██████▌ | Elapsed:01:47" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✔ Logistic Regression Classifier w/ O... 65%|██████▌ | Elapsed:01:56\n", + "✔ XGBoost Classifier w/ One Hot Encod... 70%|███████ | Elapsed:01:57\n", + "▹ Random Forest Classifier w/ One Hot... 75%|███████▌ | Elapsed:01:57" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✔ Random Forest Classifier w/ One Hot... 75%|███████▌ | Elapsed:02:18\n", + "✔ Random Forest Classifier w/ One Hot... 
80%|████████ | Elapsed:02:18\n", "\n", - "Testing XGBoost w/ imputation: 100%|██████████| 20/20 [02:02<00:00, 6.11s/it] \n", + "\n", + "Max time elapsed. Stopping search early.\n", "\n", "✔ Optimization finished\n" ] @@ -187,163 +359,131 @@ " \n", " \n", " 0\n", - " 18\n", + " 14\n", " XGBoostPipeline\n", " 0.007614\n", " False\n", - " {'eta': 0.1496748671836832, 'min_child_weight'...\n", + " {'eta': 0.264555612104627, 'min_child_weight':...\n", " \n", " \n", " 1\n", - " 14\n", + " 12\n", " RFClassificationPipeline\n", " 0.007614\n", " False\n", - " {'n_estimators': 609, 'max_depth': 7, 'impute_...\n", + " {'n_estimators': 369, 'max_depth': 10, 'impute...\n", " \n", " \n", " 2\n", - " 17\n", - " XGBoostPipeline\n", - " 0.007614\n", + " 2\n", + " RFClassificationPipeline\n", + " 0.007615\n", " False\n", - " {'eta': 0.264555612104627, 'min_child_weight':...\n", + " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", " \n", " \n", " 3\n", - " 8\n", + " 0\n", " XGBoostPipeline\n", - " 0.007614\n", + " 0.007615\n", " False\n", - " {'eta': 0.6481718720511973, 'min_child_weight'...\n", + " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", " \n", " 4\n", - " 16\n", - " RFClassificationPipeline\n", - " 0.007614\n", + " 3\n", + " XGBoostPipeline\n", + " 0.007615\n", " False\n", - " {'n_estimators': 715, 'max_depth': 7, 'impute_...\n", + " {'eta': 0.5288949197529046, 'min_child_weight'...\n", " \n", " \n", " 5\n", - " 0\n", + " 5\n", " XGBoostPipeline\n", - " 0.007614\n", + " 0.007615\n", " False\n", - " {'eta': 0.5928446182250184, 'min_child_weight'...\n", + " {'eta': 0.6481718720511973, 'min_child_weight'...\n", " \n", " \n", " 6\n", - " 15\n", + " 10\n", " XGBoostPipeline\n", - " 0.007614\n", + " 0.007615\n", " False\n", " {'eta': 0.7206326547259169, 'min_child_weight'...\n", " \n", " \n", " 7\n", - " 1\n", + " 7\n", " XGBoostPipeline\n", - " 0.007614\n", + " 0.007615\n", " False\n", - " {'eta': 0.38438170729269994, 'min_child_weight...\n", + " {'eta': 0.9786183422327642, 'min_child_weight'...\n", " \n", " \n", " 8\n", - " 19\n", + " 1\n", " XGBoostPipeline\n", - " 0.007614\n", + " 0.007616\n", " False\n", - " {'eta': 0.35950790057378607, 'min_child_weight...\n", + " {'eta': 0.38438170729269994, 'min_child_weight...\n", " \n", " \n", " 9\n", - " 13\n", - " XGBoostPipeline\n", - " 0.007614\n", + " 15\n", + " RFClassificationPipeline\n", + " 0.007616\n", " False\n", - " {'eta': 0.9786183422327642, 'min_child_weight'...\n", + " {'n_estimators': 609, 'max_depth': 7, 'impute_...\n", " \n", " \n", " 10\n", " 4\n", " LogisticRegressionPipeline\n", - " 0.007614\n", + " 0.007616\n", " False\n", - " {'penalty': 'l2', 'C': 6.239401330891865, 'imp...\n", + " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", " \n", " \n", " 11\n", " 6\n", " LogisticRegressionPipeline\n", - " 0.007614\n", + " 0.007616\n", " False\n", - " {'penalty': 'l2', 'C': 8.123565600467177, 'imp...\n", + " {'penalty': 'l2', 'C': 6.239401330891865, 'imp...\n", " \n", " \n", " 12\n", - " 10\n", - " RFClassificationPipeline\n", - " 0.007614\n", + " 8\n", + " LogisticRegressionPipeline\n", + " 0.007616\n", " False\n", - " {'n_estimators': 369, 'max_depth': 10, 'impute...\n", + " {'penalty': 'l2', 'C': 0.5765626434012575, 'im...\n", " \n", " \n", " 13\n", - " 12\n", + " 9\n", " LogisticRegressionPipeline\n", - " 0.007615\n", + " 0.007616\n", " False\n", - " {'penalty': 'l2', 'C': 8.702171711000782, 'imp...\n", + " {'penalty': 'l2', 'C': 8.123565600467177, 'imp...\n", " \n", " \n", " 14\n", - " 2\n", - " 
LogisticRegressionPipeline\n", - " 0.007615\n", - " False\n", - " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", - " \n", - " \n", - " 15\n", - " 3\n", - " XGBoostPipeline\n", - " 0.007615\n", - " False\n", - " {'eta': 0.5288949197529046, 'min_child_weight'...\n", - " \n", - " \n", - " 16\n", " 11\n", " LogisticRegressionPipeline\n", - " 0.007615\n", - " False\n", - " {'penalty': 'l2', 'C': 3.6887329830070748, 'im...\n", - " \n", - " \n", - " 17\n", - " 7\n", - " LogisticRegressionPipeline\n", " 0.007616\n", " False\n", " {'penalty': 'l2', 'C': 8.362426847738403, 'imp...\n", " \n", " \n", - " 18\n", - " 5\n", + " 15\n", + " 13\n", " LogisticRegressionPipeline\n", " 0.007616\n", " False\n", - " {'penalty': 'l2', 'C': 0.5765626434012575, 'im...\n", - " \n", - " \n", - " 19\n", - " 9\n", - " RFClassificationPipeline\n", - " 0.007617\n", - " False\n", - " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", + " {'penalty': 'l2', 'C': 3.6887329830070748, 'im...\n", " \n", " \n", "\n", @@ -351,48 +491,40 @@ ], "text/plain": [ " id pipeline_name score high_variance_cv \\\n", - "0 18 XGBoostPipeline 0.007614 False \n", - "1 14 RFClassificationPipeline 0.007614 False \n", - "2 17 XGBoostPipeline 0.007614 False \n", - "3 8 XGBoostPipeline 0.007614 False \n", - "4 16 RFClassificationPipeline 0.007614 False \n", - "5 0 XGBoostPipeline 0.007614 False \n", - "6 15 XGBoostPipeline 0.007614 False \n", - "7 1 XGBoostPipeline 0.007614 False \n", - "8 19 XGBoostPipeline 0.007614 False \n", - "9 13 XGBoostPipeline 0.007614 False \n", - "10 4 LogisticRegressionPipeline 0.007614 False \n", - "11 6 LogisticRegressionPipeline 0.007614 False \n", - "12 10 RFClassificationPipeline 0.007614 False \n", - "13 12 LogisticRegressionPipeline 0.007615 False \n", - "14 2 LogisticRegressionPipeline 0.007615 False \n", - "15 3 XGBoostPipeline 0.007615 False \n", - "16 11 LogisticRegressionPipeline 0.007615 False \n", - "17 7 LogisticRegressionPipeline 0.007616 False \n", - "18 5 LogisticRegressionPipeline 0.007616 False \n", - "19 9 RFClassificationPipeline 0.007617 False \n", + "0 14 XGBoostPipeline 0.007614 False \n", + "1 12 RFClassificationPipeline 0.007614 False \n", + "2 2 RFClassificationPipeline 0.007615 False \n", + "3 0 XGBoostPipeline 0.007615 False \n", + "4 3 XGBoostPipeline 0.007615 False \n", + "5 5 XGBoostPipeline 0.007615 False \n", + "6 10 XGBoostPipeline 0.007615 False \n", + "7 7 XGBoostPipeline 0.007615 False \n", + "8 1 XGBoostPipeline 0.007616 False \n", + "9 15 RFClassificationPipeline 0.007616 False \n", + "10 4 LogisticRegressionPipeline 0.007616 False \n", + "11 6 LogisticRegressionPipeline 0.007616 False \n", + "12 8 LogisticRegressionPipeline 0.007616 False \n", + "13 9 LogisticRegressionPipeline 0.007616 False \n", + "14 11 LogisticRegressionPipeline 0.007616 False \n", + "15 13 LogisticRegressionPipeline 0.007616 False \n", "\n", " parameters \n", - "0 {'eta': 0.1496748671836832, 'min_child_weight'... \n", - "1 {'n_estimators': 609, 'max_depth': 7, 'impute_... \n", - "2 {'eta': 0.264555612104627, 'min_child_weight':... \n", - "3 {'eta': 0.6481718720511973, 'min_child_weight'... \n", - "4 {'n_estimators': 715, 'max_depth': 7, 'impute_... \n", - "5 {'eta': 0.5928446182250184, 'min_child_weight'... \n", + "0 {'eta': 0.264555612104627, 'min_child_weight':... \n", + "1 {'n_estimators': 369, 'max_depth': 10, 'impute... \n", + "2 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", + "3 {'eta': 0.5928446182250184, 'min_child_weight'... 
\n", + "4 {'eta': 0.5288949197529046, 'min_child_weight'... \n", + "5 {'eta': 0.6481718720511973, 'min_child_weight'... \n", "6 {'eta': 0.7206326547259169, 'min_child_weight'... \n", - "7 {'eta': 0.38438170729269994, 'min_child_weight... \n", - "8 {'eta': 0.35950790057378607, 'min_child_weight... \n", - "9 {'eta': 0.9786183422327642, 'min_child_weight'... \n", - "10 {'penalty': 'l2', 'C': 6.239401330891865, 'imp... \n", - "11 {'penalty': 'l2', 'C': 8.123565600467177, 'imp... \n", - "12 {'n_estimators': 369, 'max_depth': 10, 'impute... \n", - "13 {'penalty': 'l2', 'C': 8.702171711000782, 'imp... \n", - "14 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", - "15 {'eta': 0.5288949197529046, 'min_child_weight'... \n", - "16 {'penalty': 'l2', 'C': 3.6887329830070748, 'im... \n", - "17 {'penalty': 'l2', 'C': 8.362426847738403, 'imp... \n", - "18 {'penalty': 'l2', 'C': 0.5765626434012575, 'im... \n", - "19 {'n_estimators': 569, 'max_depth': 22, 'impute... " + "7 {'eta': 0.9786183422327642, 'min_child_weight'... \n", + "8 {'eta': 0.38438170729269994, 'min_child_weight... \n", + "9 {'n_estimators': 609, 'max_depth': 7, 'impute_... \n", + "10 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", + "11 {'penalty': 'l2', 'C': 6.239401330891865, 'imp... \n", + "12 {'penalty': 'l2', 'C': 0.5765626434012575, 'im... \n", + "13 {'penalty': 'l2', 'C': 8.123565600467177, 'imp... \n", + "14 {'penalty': 'l2', 'C': 8.362426847738403, 'imp... \n", + "15 {'penalty': 'l2', 'C': 3.6887329830070748, 'im... " ] }, "execution_count": 6, @@ -447,52 +579,54 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1m************************\u001b[0m\n", - "\u001b[1m* Pipeline Description *\u001b[0m\n", - "\u001b[1m************************\u001b[0m\n", + "\u001b[1m********************************************************************************************\u001b[0m\n", + "\u001b[1m* XGBoost Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model *\u001b[0m\n", + "\u001b[1m********************************************************************************************\u001b[0m\n", "\n", - "Pipeline Name: XGBoost w/ imputation\n", - "Model type: xgboost\n", - "Objective: Fraud Cost (lower is better)\n", - "Total training time (including CV): 4.0 seconds\n", + "Problem Types: Binary Classification, Multiclass Classification\n", + "Model Type: XGBoost Classifier\n", + "Objective to Optimize: Fraud Cost (lower is better)\n", "\n", - "Parameters\n", - "==========\n", - "• eta: 0.5928446182250184\n", - "• min_child_weight: 8.598391737229157\n", - "• max_depth: 4\n", - "• impute_strategy: most_frequent\n", - "• percent_features: 0.6273280598181127\n", + "Pipeline Steps\n", + "==============\n", + "1. One Hot Encoder\n", + "2. Simple Imputer\n", + "\t * impute_strategy : most_frequent\n", + "3. RF Classifier Select From Model\n", + "\t * percent_features : 0.6273280598181127\n", + "\t * threshold : -inf\n", + "4. 
XGBoost Classifier\n", + "\t * eta : 0.5928446182250184\n", + "\t * max_depth : 4\n", + "\t * min_child_weight : 8.598391737229157\n", + "\n", + "Training\n", + "========\n", + "Training for Binary Classification problems.\n", + "Total training time (including CV): 1.9 seconds\n", "\n", "Cross Validation\n", - "=================\n", - " F1 Precision Recall AUC Log Loss Fraud Cost # Training # Testing\n", - "0 0.264 0.152 0.264 0.856 0.188 0.008 13332.000 6666.000\n", - "1 0.264 0.152 0.264 0.844 0.190 0.008 13332.000 6666.000\n", - "2 0.264 0.152 0.264 0.836 0.198 0.008 13332.000 6666.000\n", - "mean 0.264 0.152 0.264 0.845 0.192 0.008 - -\n", - "std 0.000 0.000 0.000 0.010 0.006 0.000 - -\n", - "coef of var 0.000 0.000 0.000 0.012 0.030 0.002 - -\n" + "----------------\n", + " Fraud Cost F1 Precision Recall AUC Log Loss MCC # Training # Testing\n", + "0 0.008 0.263 0.152 0.263 0.855 0.188 -0.007 13332.000 6666.000\n", + "1 0.008 0.263 0.152 0.263 0.844 0.190 -0.000 13332.000 6666.000\n", + "2 0.008 0.264 0.152 0.264 0.832 0.200 0.000 13332.000 6666.000\n", + "mean 0.008 0.264 0.152 0.264 0.844 0.192 -0.002 - -\n", + "std 0.000 0.000 0.000 0.000 0.012 0.006 0.004 - -\n", + "coef of var 0.002 0.000 0.000 0.000 0.014 0.033 -1.603 - -\n" ] } ], "source": [ "clf.describe_pipeline(0)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -511,7 +645,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py index 44bd7f75e7..ba09e0264f 100644 --- a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py @@ -36,7 +36,9 @@ def transform(self, X): try: X_t = self._component_obj.transform(X) if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): - X_t = pd.DataFrame(X_t, index=X.index, columns=self.get_names()) + X_dtypes = X.dtypes.to_dict() + col_types = {key: X_dtypes[key] for key in self.get_names()} + X_t = pd.DataFrame(X_t, index=X.index, columns=self.get_names()).astype(col_types) return X_t except AttributeError: raise RuntimeError("Transformer requires a transform method or a component_obj that implements transform") @@ -58,7 +60,9 @@ def fit_transform(self, X, y=None): try: X_t = self._component_obj.fit_transform(X, y) if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): - X_t = pd.DataFrame(X_t, index=X.index, columns=self.get_names()) + X_dtypes = X.dtypes.to_dict() + col_types = {key: X_dtypes[key] for key in self.get_names()} + X_t = pd.DataFrame(X_t, index=X.index, columns=self.get_names()).astype(col_types) return X_t except AttributeError: raise RuntimeError("Transformer requires a fit_transform method or a component_obj that implements fit_transform") diff --git a/evalml/pipelines/components/transformers/transformer.py b/evalml/pipelines/components/transformers/transformer.py index 64a3324678..73e1324a73 100644 --- a/evalml/pipelines/components/transformers/transformer.py +++ b/evalml/pipelines/components/transformers/transformer.py @@ -20,7 +20,7 @@ def transform(self, X): try: X_t = self._component_obj.transform(X) if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): - X_t = 
pd.DataFrame(X_t, columns=X.columns) + X_t = pd.DataFrame(X_t, columns=X.columns).astype(X.dtypes.to_dict()) return X_t except AttributeError: raise RuntimeError("Transformer requires a transform method or a component_obj that implements transform") @@ -37,7 +37,7 @@ def fit_transform(self, X, y=None): try: X_t = self._component_obj.fit_transform(X, y) if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): - X_t = pd.DataFrame(X_t, columns=X.columns) + X_t = pd.DataFrame(X_t, columns=X.columns).astype(X.dtypes.to_dict()) return X_t except AttributeError: raise RuntimeError("Transformer requires a fit_transform method or a component_obj that implements fit_transform") From cb8fd963ca6a9c7b39cc2c44523150e1ba175914 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Mon, 11 Nov 2019 16:49:01 -0500 Subject: [PATCH 11/17] moving typing from transformer to simpleimputer --- .../feature_selection/feature_selector.py | 10 +++-- .../transformers/imputers/simple_imputer.py | 38 +++++++++++++++++++ .../components/transformers/transformer.py | 4 +- .../pipeline_tests/test_linear_regression.py | 12 ++++-- 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py index ba09e0264f..8d2fd19eb2 100644 --- a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py @@ -37,8 +37,9 @@ def transform(self, X): X_t = self._component_obj.transform(X) if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): X_dtypes = X.dtypes.to_dict() - col_types = {key: X_dtypes[key] for key in self.get_names()} - X_t = pd.DataFrame(X_t, index=X.index, columns=self.get_names()).astype(col_types) + selected_col_names = self.get_names() + col_types = {key: X_dtypes[key] for key in selected_col_names} + X_t = pd.DataFrame(X_t, columns=selected_col_names, index=X.index).astype(col_types) return X_t except AttributeError: raise RuntimeError("Transformer requires a transform method or a component_obj that implements transform") @@ -61,8 +62,9 @@ def fit_transform(self, X, y=None): X_t = self._component_obj.fit_transform(X, y) if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): X_dtypes = X.dtypes.to_dict() - col_types = {key: X_dtypes[key] for key in self.get_names()} - X_t = pd.DataFrame(X_t, index=X.index, columns=self.get_names()).astype(col_types) + selected_col_names = self.get_names() + col_types = {key: X_dtypes[key] for key in selected_col_names} + X_t = pd.DataFrame(X_t, columns=selected_col_names, index=X.index).astype(col_types) return X_t except AttributeError: raise RuntimeError("Transformer requires a fit_transform method or a component_obj that implements fit_transform") diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py index 2d77b0b8fc..ff516d7efe 100644 --- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py @@ -1,3 +1,5 @@ +import pandas as pd + from sklearn.impute import SimpleImputer as SkImputer from evalml.pipelines.components import ComponentTypes @@ -17,3 +19,39 @@ def __init__(self, impute_strategy="most_frequent"): super().__init__(parameters=parameters, component_obj=imputer, random_state=0) + + def 
transform(self, X): + """Transforms data X + + Arguments: + X (DataFrame): Data to transform + + Returns: + DataFrame: Transformed X + """ + try: + X_t = self._component_obj.transform(X) + if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): + # skLearn's SimpleImputer loses track of column type, so we need to restore + X_t = pd.DataFrame(X_t, columns=X.columns, index=X.index).astype(X.dtypes.to_dict()) + return X_t + except AttributeError: + raise RuntimeError("Transformer requires a transform method or a component_obj that implements transform") + + def fit_transform(self, X, y=None): + """Fits on X and transforms X + + Arguments: + X (DataFrame): Data to fit and transform + + Returns: + DataFrame: Transformed X + """ + try: + X_t = self._component_obj.fit_transform(X, y) + if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): + # skLearn's SimpleImputer loses track of column type, so we need to restore + X_t = pd.DataFrame(X_t, columns=X.columns, index=X.index).astype(X.dtypes.to_dict()) + return X_t + except AttributeError: + raise RuntimeError("Transformer requires a fit_transform method or a component_obj that implements fit_transform") diff --git a/evalml/pipelines/components/transformers/transformer.py b/evalml/pipelines/components/transformers/transformer.py index 73e1324a73..d32c24d84f 100644 --- a/evalml/pipelines/components/transformers/transformer.py +++ b/evalml/pipelines/components/transformers/transformer.py @@ -20,7 +20,7 @@ def transform(self, X): try: X_t = self._component_obj.transform(X) if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): - X_t = pd.DataFrame(X_t, columns=X.columns).astype(X.dtypes.to_dict()) + X_t = pd.DataFrame(X_t, columns=X.columns, index=X.index) return X_t except AttributeError: raise RuntimeError("Transformer requires a transform method or a component_obj that implements transform") @@ -37,7 +37,7 @@ def fit_transform(self, X, y=None): try: X_t = self._component_obj.fit_transform(X, y) if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): - X_t = pd.DataFrame(X_t, columns=X.columns).astype(X.dtypes.to_dict()) + X_t = pd.DataFrame(X_t, columns=X.columns, index=X.index) return X_t except AttributeError: raise RuntimeError("Transformer requires a fit_transform method or a component_obj that implements fit_transform") diff --git a/evalml/tests/pipeline_tests/test_linear_regression.py b/evalml/tests/pipeline_tests/test_linear_regression.py index 5176b1b377..9f13719ce0 100644 --- a/evalml/tests/pipeline_tests/test_linear_regression.py +++ b/evalml/tests/pipeline_tests/test_linear_regression.py @@ -12,9 +12,8 @@ def test_linear_regression(X_y_categorical_regression): X, y = X_y_categorical_regression - - imputer = SimpleImputer(strategy='mean') enc = ce.OneHotEncoder(use_cat_names=True, return_df=True) + imputer = SimpleImputer(strategy='mean') scaler = StandardScaler() estimator = LinearRegression(normalize=False, fit_intercept=True, n_jobs=-1) sk_pipeline = Pipeline([("encoder", enc), @@ -22,10 +21,17 @@ def test_linear_regression(X_y_categorical_regression): ("scaler", scaler), ("estimator", estimator)]) sk_pipeline.fit(X, y) + sk_score = sk_pipeline.score(X, y) objective = R2() - clf = LinearRegressionPipeline(objective=objective, number_features=len(X.columns), random_state=0, impute_strategy='mean', normalize=False, fit_intercept=True, n_jobs=-1) + clf = LinearRegressionPipeline(objective=objective, + number_features=len(X.columns), + impute_strategy='mean', + normalize=False, + 
fit_intercept=True,
+                                   random_state=0,
+                                   n_jobs=-1)
     clf.fit(X, y)
     clf_score = clf.score(X, y)
     y_pred = clf.predict(X)

From 19072b37fdc2930e0054f9540a9164e65a6c45f5 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Mon, 11 Nov 2019 16:50:11 -0500
Subject: [PATCH 12/17] linting

---
 evalml/tests/pipeline_tests/test_linear_regression.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/evalml/tests/pipeline_tests/test_linear_regression.py b/evalml/tests/pipeline_tests/test_linear_regression.py
index 9f13719ce0..298693c9d7 100644
--- a/evalml/tests/pipeline_tests/test_linear_regression.py
+++ b/evalml/tests/pipeline_tests/test_linear_regression.py
@@ -21,7 +21,6 @@ def test_linear_regression(X_y_categorical_regression):
                             ("scaler", scaler),
                             ("estimator", estimator)])
     sk_pipeline.fit(X, y)
-    sk_score = sk_pipeline.score(X, y)
 
     objective = R2()
 

From 97c1e5c6feb6b028e2e5808c186502c984a27702 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Mon, 11 Nov 2019 16:50:28 -0500
Subject: [PATCH 13/17] lint

---
 .../pipelines/components/transformers/imputers/simple_imputer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
index ff516d7efe..db8880047c 100644
--- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
@@ -1,5 +1,4 @@
 import pandas as pd
-
 from sklearn.impute import SimpleImputer as SkImputer
 
 from evalml.pipelines.components import ComponentTypes

From 8c816e7b8b8195acb01b36ee4cc8084456fd23e8 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Mon, 11 Nov 2019 17:09:35 -0500
Subject: [PATCH 14/17] remove docstrings

---
 .../transformers/imputers/simple_imputer.py      | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
index db8880047c..39e23e9817 100644
--- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
@@ -20,14 +20,6 @@ def __init__(self, impute_strategy="most_frequent"):
                          random_state=0)
 
     def transform(self, X):
-        """Transforms data X
-
-        Arguments:
-            X (DataFrame): Data to transform
-
-        Returns:
-            DataFrame: Transformed X
-        """
         try:
             X_t = self._component_obj.transform(X)
             if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame):
@@ -38,14 +30,6 @@ def transform(self, X):
 
     def fit_transform(self, X, y=None):
-        """Fits on X and transforms X
-
-        Arguments:
-            X (DataFrame): Data to fit and transform
-
-        Returns:
-            DataFrame: Transformed X
-        """
         try:
             X_t = self._component_obj.fit_transform(X, y)
             if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame):

From 4ec77e8be36da6b3b272c7d5802108b3b70f6713 Mon Sep 17 00:00:00 2001
From: Angela Lin
Date: Mon, 11 Nov 2019 18:03:35 -0500
Subject: [PATCH 15/17] linting

---
 evalml/tests/pipeline_tests/test_pipelines.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py
index 3aa7ef1097..c600aa3770 100644
--- a/evalml/tests/pipeline_tests/test_pipelines.py
+++ b/evalml/tests/pipeline_tests/test_pipelines.py
@@ -154,7 +154,7 @@ def test_multi_format_creation(X_y):
 
 def 
test_multiple_feature_selectors(X_y): X, y = X_y - clf = PipelineBase('precision', component_list=['Simple Imputer', 'categorical_encoder', ComponentTypes.FEATURE_SELECTION_CLASSIFIER, StandardScaler(), ComponentTypes.FEATURE_SELECTION_CLASSIFIER, ComponentTypes.CLASSIFIER], n_jobs=-1, random_state=0) + clf = PipelineBase('precision', component_list=['Simple Imputer', 'categorical_encoder', ComponentTypes.FEATURE_SELECTION_CLASSIFIER, StandardScaler(), ComponentTypes.FEATURE_SELECTION_CLASSIFIER, ComponentTypes.CLASSIFIER], n_jobs=-1, random_state=0) correct_components = [SimpleImputer, OneHotEncoder, RFClassifierSelectFromModel, StandardScaler, RFClassifierSelectFromModel, LogisticRegressionClassifier] for component, correct_components in zip(clf.component_list, correct_components): assert isinstance(component, correct_components) From d1fb91e0b70e3c5472287a82a57911283f6819c0 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 12 Nov 2019 11:56:30 -0500 Subject: [PATCH 16/17] adding num features, removing try/except, refreshing notebooks --- docs/Fraud Prediction Demo.ipynb | 43 +++--- docs/source/automl/guardrails.ipynb | 14 +- docs/source/automl/regression_example.ipynb | 22 +-- docs/source/automl/search_results.ipynb | 62 ++++----- docs/source/demos/fraud.ipynb | 130 +++++++----------- docs/source/index.ipynb | 16 +-- .../transformers/imputers/simple_imputer.py | 26 ++-- evalml/pipelines/pipeline_base.py | 3 + 8 files changed, 145 insertions(+), 171 deletions(-) diff --git a/docs/Fraud Prediction Demo.ipynb b/docs/Fraud Prediction Demo.ipynb index 4ea956f78d..8585f344b2 100644 --- a/docs/Fraud Prediction Demo.ipynb +++ b/docs/Fraud Prediction Demo.ipynb @@ -127,10 +127,10 @@ "Searching up to 20 pipelines. \n", "Will stop searching for new pipelines after 120 seconds.\n", "\n", - "Possible model types: linear_model, random_forest, xgboost\n", + "Possible model types: random_forest, linear_model, xgboost\n", "\n", - "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:01\n", - "▹ XGBoost Classifier w/ One Hot Encod... 5%|▌ | Elapsed:00:01" + "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:02\n", + "▹ XGBoost Classifier w/ One Hot Encod... 5%|▌ | Elapsed:00:02" ] }, { @@ -163,8 +163,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "✔ Random Forest Classifier w/ One Hot... 10%|█ | Elapsed:00:38\n", - "▹ XGBoost Classifier w/ One Hot Encod... 15%|█▌ | Elapsed:00:38" + "✔ Random Forest Classifier w/ One Hot... 10%|█ | Elapsed:00:39\n", + "▹ XGBoost Classifier w/ One Hot Encod... 15%|█▌ | Elapsed:00:39" ] }, { @@ -296,8 +296,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "✔ Random Forest Classifier w/ One Hot... 75%|███████▌ | Elapsed:02:18\n", - "✔ Random Forest Classifier w/ One Hot... 80%|████████ | Elapsed:02:18\n", + "✔ Random Forest Classifier w/ One Hot... 75%|███████▌ | Elapsed:02:19\n", + "✔ Random Forest Classifier w/ One Hot... 80%|████████ | Elapsed:02:19\n", "\n", "\n", "Max time elapsed. 
Stopping search early.\n", @@ -439,27 +439,27 @@ " \n", " \n", " 10\n", - " 4\n", + " 8\n", " LogisticRegressionPipeline\n", " 0.007616\n", " False\n", - " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", + " {'penalty': 'l2', 'C': 0.5765626434012575, 'im...\n", " \n", " \n", " 11\n", - " 6\n", + " 4\n", " LogisticRegressionPipeline\n", " 0.007616\n", " False\n", - " {'penalty': 'l2', 'C': 6.239401330891865, 'imp...\n", + " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", " \n", " \n", " 12\n", - " 8\n", + " 6\n", " LogisticRegressionPipeline\n", " 0.007616\n", " False\n", - " {'penalty': 'l2', 'C': 0.5765626434012575, 'im...\n", + " {'penalty': 'l2', 'C': 6.239401330891865, 'imp...\n", " \n", " \n", " 13\n", @@ -501,9 +501,9 @@ "7 7 XGBoostPipeline 0.007615 False \n", "8 1 XGBoostPipeline 0.007616 False \n", "9 15 RFClassificationPipeline 0.007616 False \n", - "10 4 LogisticRegressionPipeline 0.007616 False \n", - "11 6 LogisticRegressionPipeline 0.007616 False \n", - "12 8 LogisticRegressionPipeline 0.007616 False \n", + "10 8 LogisticRegressionPipeline 0.007616 False \n", + "11 4 LogisticRegressionPipeline 0.007616 False \n", + "12 6 LogisticRegressionPipeline 0.007616 False \n", "13 9 LogisticRegressionPipeline 0.007616 False \n", "14 11 LogisticRegressionPipeline 0.007616 False \n", "15 13 LogisticRegressionPipeline 0.007616 False \n", @@ -519,9 +519,9 @@ "7 {'eta': 0.9786183422327642, 'min_child_weight'... \n", "8 {'eta': 0.38438170729269994, 'min_child_weight... \n", "9 {'n_estimators': 609, 'max_depth': 7, 'impute_... \n", - "10 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", - "11 {'penalty': 'l2', 'C': 6.239401330891865, 'imp... \n", - "12 {'penalty': 'l2', 'C': 0.5765626434012575, 'im... \n", + "10 {'penalty': 'l2', 'C': 0.5765626434012575, 'im... \n", + "11 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", + "12 {'penalty': 'l2', 'C': 6.239401330891865, 'imp... \n", "13 {'penalty': 'l2', 'C': 8.123565600467177, 'imp... \n", "14 {'penalty': 'l2', 'C': 8.362426847738403, 'imp... \n", "15 {'penalty': 'l2', 'C': 3.6887329830070748, 'im... " @@ -579,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -593,6 +593,7 @@ "Problem Types: Binary Classification, Multiclass Classification\n", "Model Type: XGBoost Classifier\n", "Objective to Optimize: Fraud Cost (lower is better)\n", + "Number of features: 3\n", "\n", "Pipeline Steps\n", "==============\n", @@ -610,7 +611,7 @@ "Training\n", "========\n", "Training for Binary Classification problems.\n", - "Total training time (including CV): 1.9 seconds\n", + "Total training time (including CV): 2.0 seconds\n", "\n", "Cross Validation\n", "----------------\n", diff --git a/docs/source/automl/guardrails.ipynb b/docs/source/automl/guardrails.ipynb index 0e7688575b..545cca7e16 100644 --- a/docs/source/automl/guardrails.ipynb +++ b/docs/source/automl/guardrails.ipynb @@ -50,8 +50,8 @@ "Possible model types: linear_model\n", "\n", "WARNING: Possible label leakage: leaked_feature, leaked_feature_2\n", - "✔ Logistic Regression Classifier w/ O... 0%| | Elapsed:00:08\n", - "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:00:08\n", + "✔ Logistic Regression Classifier w/ O... 0%| | Elapsed:00:07\n", + "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:00:07\n", "\n", "✔ Optimization finished\n" ] @@ -182,7 +182,7 @@ "Optimizing for Precision. Greater score is better.\n", "\n", "Searching up to 1 pipelines. 
\n", - "Possible model types: xgboost, linear_model, random_forest\n", + "Possible model types: linear_model, random_forest, xgboost\n", "\n", "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00\n", "✔ XGBoost Classifier w/ One Hot Encod... 100%|██████████| Elapsed:00:00\n", @@ -227,10 +227,10 @@ "Problem Types: Binary Classification, Multiclass Classification\n", "Model Type: XGBoost Classifier\n", "Objective to Optimize: Precision (greater is better)\n", - "\n", + "Number of features: 18\n", "\n", "Pipeline Steps\n", - "===============\n", + "==============\n", "1. One Hot Encoder\n", "2. Simple Imputer\n", "\t * impute_strategy : most_frequent\n", @@ -245,7 +245,7 @@ "Training\n", "========\n", "Training for Binary Classification problems.\n", - "Total training time (including CV): 0.4 seconds\n", + "Total training time (including CV): 0.5 seconds\n", "\n", "Cross Validation\n", "----------------\n", @@ -379,7 +379,7 @@ "Optimizing for Recall. Greater score is better.\n", "\n", "Searching up to 3 pipelines. \n", - "Possible model types: xgboost, linear_model, random_forest\n", + "Possible model types: linear_model, random_forest, xgboost\n", "\n", "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00\n", "✔ XGBoost Classifier w/ One Hot Encod... 33%|███▎ | Elapsed:00:00\n", diff --git a/docs/source/automl/regression_example.ipynb b/docs/source/automl/regression_example.ipynb index 98019ca5e3..7005b8b7e2 100644 --- a/docs/source/automl/regression_example.ipynb +++ b/docs/source/automl/regression_example.ipynb @@ -25,12 +25,12 @@ "Searching up to 5 pipelines. \n", "Possible model types: linear_model, random_forest\n", "\n", - "✔ Random Forest Regressor w/ One Hot ... 0%| | Elapsed:00:06\n", - "✔ Random Forest Regressor w/ One Hot ... 20%|██ | Elapsed:00:10\n", - "✔ Linear Regressor w/ One Hot Encoder... 40%|████ | Elapsed:00:10\n", - "✔ Random Forest Regressor w/ One Hot ... 40%|████ | Elapsed:00:16\n", - "✔ Random Forest Regressor w/ One Hot ... 80%|████████ | Elapsed:00:30\n", - "✔ Random Forest Regressor w/ One Hot ... 100%|██████████| Elapsed:00:30\n", + "✔ Random Forest Regressor w/ One Hot ... 0%| | Elapsed:00:05\n", + "✔ Random Forest Regressor w/ One Hot ... 20%|██ | Elapsed:00:09\n", + "✔ Linear Regressor w/ One Hot Encoder... 40%|████ | Elapsed:00:09\n", + "✔ Random Forest Regressor w/ One Hot ... 40%|████ | Elapsed:00:15\n", + "✔ Random Forest Regressor w/ One Hot ... 80%|████████ | Elapsed:00:21\n", + "✔ Random Forest Regressor w/ One Hot ... 100%|██████████| Elapsed:00:21\n", "\n", "✔ Optimization finished\n" ] @@ -160,7 +160,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 3, @@ -180,7 +180,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -208,10 +208,10 @@ "Problem Types: Regression\n", "Model Type: Random Forest\n", "Objective to Optimize: R2 (greater is better)\n", - "\n", + "Number of features: 8\n", "\n", "Pipeline Steps\n", - "===============\n", + "==============\n", "1. One Hot Encoder\n", "2. 
Simple Imputer\n", "\t * impute_strategy : most_frequent\n", @@ -225,7 +225,7 @@ "Training\n", "========\n", "Training for Regression problems.\n", - "Total training time (including CV): 6.0 seconds\n", + "Total training time (including CV): 5.6 seconds\n", "\n", "Cross Validation\n", "----------------\n", diff --git a/docs/source/automl/search_results.ipynb b/docs/source/automl/search_results.ipynb index 71a2d572b9..262b7bc9f7 100644 --- a/docs/source/automl/search_results.ipynb +++ b/docs/source/automl/search_results.ipynb @@ -25,7 +25,7 @@ "Optimizing for F1. Greater score is better.\n", "\n", "Searching up to 10 pipelines. \n", - "Possible model types: linear_model, xgboost, random_forest\n", + "Possible model types: xgboost, random_forest, linear_model\n", "\n", "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00\n", "✔ XGBoost Classifier w/ One Hot Encod... 10%|█ | Elapsed:00:00\n", @@ -33,11 +33,11 @@ "✔ XGBoost Classifier w/ One Hot Encod... 30%|███ | Elapsed:00:06\n", "✔ Logistic Regression Classifier w/ O... 40%|████ | Elapsed:00:14\n", "✔ XGBoost Classifier w/ One Hot Encod... 50%|█████ | Elapsed:00:14\n", - "✔ Logistic Regression Classifier w/ O... 60%|██████ | Elapsed:00:22\n", + "✔ Logistic Regression Classifier w/ O... 60%|██████ | Elapsed:00:21\n", "✔ XGBoost Classifier w/ One Hot Encod... 70%|███████ | Elapsed:00:22\n", - "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:00:32\n", - "✔ Logistic Regression Classifier w/ O... 90%|█████████ | Elapsed:00:39\n", - "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:00:39\n", + "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:00:29\n", + "✔ Logistic Regression Classifier w/ O... 90%|█████████ | Elapsed:00:37\n", + "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:00:37\n", "\n", "✔ Optimization finished\n" ] @@ -239,10 +239,10 @@ "Problem Types: Binary Classification, Multiclass Classification\n", "Model Type: XGBoost Classifier\n", "Objective to Optimize: F1 (greater is better)\n", - "\n", + "Number of features: 18\n", "\n", "Pipeline Steps\n", - "===============\n", + "==============\n", "1. One Hot Encoder\n", "2. 
Simple Imputer\n", "\t * impute_strategy : most_frequent\n", @@ -291,7 +291,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -319,7 +319,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -547,7 +547,7 @@ " ('MCC', 0.9231826763268304),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 0.2018277645111084},\n", + " 'training_time': 0.248244047164917},\n", " 1: {'id': 1,\n", " 'pipeline_name': 'XGBoostPipeline',\n", " 'parameters': {'eta': 0.38438170729269994,\n", @@ -582,7 +582,7 @@ " ('MCC', 0.9218075091290715),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 0.26802611351013184},\n", + " 'training_time': 0.29195380210876465},\n", " 2: {'id': 2,\n", " 'pipeline_name': 'RFClassificationPipeline',\n", " 'parameters': {'n_estimators': 569,\n", @@ -616,7 +616,7 @@ " ('MCC', 0.9208800271662652),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 6.220675945281982},\n", + " 'training_time': 6.06977105140686},\n", " 3: {'id': 3,\n", " 'pipeline_name': 'XGBoostPipeline',\n", " 'parameters': {'eta': 0.5288949197529046,\n", @@ -651,7 +651,7 @@ " ('MCC', 0.8783921421654207),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 0.17779779434204102},\n", + " 'training_time': 0.20792675018310547},\n", " 4: {'id': 4,\n", " 'pipeline_name': 'LogisticRegressionPipeline',\n", " 'parameters': {'penalty': 'l2',\n", @@ -664,7 +664,7 @@ " ('Precision', 0.9745762711864406),\n", " ('Recall', 0.970464135021097),\n", " ('AUC', 0.9885193514025328),\n", - " ('Log Loss', 0.19432945908194826),\n", + " ('Log Loss', 0.1943294590819038),\n", " ('MCC', 0.9215733295732883),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", @@ -672,7 +672,7 @@ " ('Precision', 0.952),\n", " ('Recall', 0.9754098360655737),\n", " ('AUC', 0.9849686353414605),\n", - " ('Log Loss', 0.1533799764176718),\n", + " ('Log Loss', 0.1533799764176819),\n", " ('MCC', 0.933568045604951),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", @@ -680,11 +680,11 @@ " ('Precision', 0.991304347826087),\n", " ('Recall', 0.9743589743589743),\n", " ('AUC', 0.990516206482593),\n", - " ('Log Loss', 0.1164316714613046),\n", + " ('Log Loss', 0.1164316714613053),\n", " ('MCC', 0.9336637889421326),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 7.575798034667969},\n", + " 'training_time': 7.461816072463989},\n", " 5: {'id': 5,\n", " 'pipeline_name': 'XGBoostPipeline',\n", " 'parameters': {'eta': 0.6481718720511973,\n", @@ -719,7 +719,7 @@ " ('MCC', 0.9218075091290715),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 0.3415100574493408},\n", + " 'training_time': 0.33750486373901367},\n", " 6: {'id': 6,\n", " 'pipeline_name': 'LogisticRegressionPipeline',\n", " 'parameters': {'penalty': 'l2',\n", @@ -732,7 +732,7 @@ " ('Precision', 0.9747899159663865),\n", " ('Recall', 0.9747899159663865),\n", " ('AUC', 0.9889927802106758),\n", - " ('Log Loss', 0.174912415672324),\n", + " ('Log Loss', 0.17491241567239438),\n", " ('MCC', 0.932536394839626),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", @@ -740,7 +740,7 @@ " ('Precision', 0.952),\n", " ('Recall', 0.9754098360655737),\n", " ('AUC', 0.9870990649781038),\n", - " ('Log Loss', 0.13982009938626028),\n", + " ('Log Loss', 0.13982009938625542),\n", " ('MCC', 0.933568045604951),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", @@ -748,11 +748,11 @@ " ('Precision', 
0.991304347826087),\n", " ('Recall', 0.9743589743589743),\n", " ('AUC', 0.990516206482593),\n", - " ('Log Loss', 0.11096455834029183),\n", + " ('Log Loss', 0.1109645583402926),\n", " ('MCC', 0.9336637889421326),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 7.9174652099609375},\n", + " 'training_time': 7.343135118484497},\n", " 7: {'id': 7,\n", " 'pipeline_name': 'XGBoostPipeline',\n", " 'parameters': {'eta': 0.9786183422327642,\n", @@ -787,7 +787,7 @@ " ('MCC', 0.9112159507396058),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 0.2536630630493164},\n", + " 'training_time': 0.26775383949279785},\n", " 8: {'id': 8,\n", " 'pipeline_name': 'LogisticRegressionPipeline',\n", " 'parameters': {'penalty': 'l2',\n", @@ -800,7 +800,7 @@ " ('Precision', 0.9833333333333333),\n", " ('Recall', 0.9874476987447698),\n", " ('AUC', 0.994910640312463),\n", - " ('Log Loss', 0.08726565374201165),\n", + " ('Log Loss', 0.08726565374201126),\n", " ('MCC', 0.9662335358054943),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", @@ -808,7 +808,7 @@ " ('Precision', 0.952),\n", " ('Recall', 0.9754098360655737),\n", " ('AUC', 0.9979879275653923),\n", - " ('Log Loss', 0.07645591278007538),\n", + " ('Log Loss', 0.0764559127800754),\n", " ('MCC', 0.933568045604951),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", @@ -816,11 +816,11 @@ " ('Precision', 0.9913793103448276),\n", " ('Recall', 0.9787234042553192),\n", " ('AUC', 0.9903961584633854),\n", - " ('Log Loss', 0.09774553003325112),\n", + " ('Log Loss', 0.09774553003325108),\n", " ('MCC', 0.9443109474170326),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 9.234977006912231},\n", + " 'training_time': 7.57702112197876},\n", " 9: {'id': 9,\n", " 'pipeline_name': 'LogisticRegressionPipeline',\n", " 'parameters': {'penalty': 'l2',\n", @@ -833,7 +833,7 @@ " ('Precision', 0.9747899159663865),\n", " ('Recall', 0.9747899159663865),\n", " ('AUC', 0.9886377086045686),\n", - " ('Log Loss', 0.1917051028291318),\n", + " ('Log Loss', 0.19170510282820305),\n", " ('MCC', 0.932536394839626),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", @@ -841,7 +841,7 @@ " ('Precision', 0.952),\n", " ('Recall', 0.9754098360655737),\n", " ('AUC', 0.9850869925434962),\n", - " ('Log Loss', 0.15159254810086167),\n", + " ('Log Loss', 0.15159254810085362),\n", " ('MCC', 0.933568045604951),\n", " ('# Training', 379),\n", " ('# Testing', 190)]),\n", @@ -849,11 +849,11 @@ " ('Precision', 0.991304347826087),\n", " ('Recall', 0.9743589743589743),\n", " ('AUC', 0.990516206482593),\n", - " ('Log Loss', 0.1156693063457744),\n", + " ('Log Loss', 0.11566930634571038),\n", " ('MCC', 0.9336637889421326),\n", " ('# Training', 380),\n", " ('# Testing', 189)])],\n", - " 'training_time': 7.72037410736084}}" + " 'training_time': 7.280526161193848}}" ] }, "execution_count": 7, diff --git a/docs/source/demos/fraud.ipynb b/docs/source/demos/fraud.ipynb index 814e65b6e7..9f5522bd91 100644 --- a/docs/source/demos/fraud.ipynb +++ b/docs/source/demos/fraud.ipynb @@ -147,26 +147,14 @@ "Optimizing for Fraud Cost. Lower score is better.\n", "\n", "Searching up to 5 pipelines. \n", - "Possible model types: random_forest, xgboost, linear_model\n", + "Possible model types: linear_model, random_forest, xgboost\n", "\n", - "▹ XGBoost Classifier w/ One Hot Encod... 
0%| | Elapsed:00:00DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields store_id, amount, lat, lng\n", - "DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields store_id, amount, lat, lng\n", - "DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields card_id, store_id, amount, lat\n", - "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:12\n", - "✔ XGBoost Classifier w/ One Hot Encod... 20%|██ | Elapsed:00:39\n", - "✔ Random Forest Classifier w/ One Hot... 40%|████ | Elapsed:02:38\n", - "▹ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:02:38DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields amount, lng\n", - "DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields store_id, amount\n", - "DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields store_id, amount\n", - "✔ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:02:50\n", - "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:03:16\n", - "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:03:16\n", + "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:24\n", + "✔ XGBoost Classifier w/ One Hot Encod... 20%|██ | Elapsed:00:51\n", + "✔ Random Forest Classifier w/ One Hot... 40%|████ | Elapsed:02:49\n", + "✔ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:03:15\n", + "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:03:47\n", + "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:03:47\n", "\n", "✔ Optimization finished\n" ] @@ -233,6 +221,14 @@ " \n", " \n", " 1\n", + " 0\n", + " XGBoostPipeline\n", + " 0.007623\n", + " False\n", + " {'eta': 0.5928446182250184, 'min_child_weight'...\n", + " \n", + " \n", + " 2\n", " 4\n", " LogisticRegressionPipeline\n", " 0.007623\n", @@ -240,7 +236,7 @@ " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", " \n", " \n", - " 2\n", + " 3\n", " 2\n", " RFClassificationPipeline\n", " 0.007623\n", @@ -248,18 +244,10 @@ " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", " \n", " \n", - " 3\n", - " 0\n", - " XGBoostPipeline\n", - " NaN\n", - " False\n", - " {'eta': 0.5928446182250184, 'min_child_weight'...\n", - " \n", - " \n", " 4\n", " 3\n", " XGBoostPipeline\n", - " NaN\n", + " 0.007623\n", " False\n", " {'eta': 0.5288949197529046, 'min_child_weight'...\n", " \n", @@ -270,16 +258,16 @@ "text/plain": [ " id pipeline_name score high_variance_cv \\\n", "0 1 XGBoostPipeline 0.007623 False \n", - "1 4 LogisticRegressionPipeline 0.007623 False \n", - "2 2 RFClassificationPipeline 0.007623 False \n", - "3 0 XGBoostPipeline NaN False \n", - "4 3 XGBoostPipeline NaN False \n", + "1 0 XGBoostPipeline 0.007623 False \n", + "2 4 LogisticRegressionPipeline 0.007623 False \n", + "3 2 RFClassificationPipeline 0.007623 False \n", + "4 3 XGBoostPipeline 0.007623 False \n", "\n", " parameters \n", "0 {'eta': 0.38438170729269994, 'min_child_weight... \n", - "1 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", - "2 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", - "3 {'eta': 0.5928446182250184, 'min_child_weight'... \n", + "1 {'eta': 0.5928446182250184, 'min_child_weight'... \n", + "2 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", + "3 {'n_estimators': 569, 'max_depth': 22, 'impute... 
\n", "4 {'eta': 0.5288949197529046, 'min_child_weight'... " ] }, @@ -333,10 +321,10 @@ "Problem Types: Binary Classification, Multiclass Classification\n", "Model Type: XGBoost Classifier\n", "Objective to Optimize: Fraud Cost (lower is better)\n", - "\n", + "Number of features: 5\n", "\n", "Pipeline Steps\n", - "===============\n", + "==============\n", "1. One Hot Encoder\n", "2. Simple Imputer\n", "\t * impute_strategy : median\n", @@ -351,7 +339,7 @@ "Training\n", "========\n", "Training for Binary Classification problems.\n", - "Total training time (including CV): 26.4 seconds\n", + "Total training time (including CV): 27.5 seconds\n", "\n", "Cross Validation\n", "----------------\n", @@ -386,7 +374,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 9, @@ -452,26 +440,14 @@ "Optimizing for AUC. Greater score is better.\n", "\n", "Searching up to 5 pipelines. \n", - "Possible model types: random_forest, xgboost, linear_model\n", + "Possible model types: linear_model, random_forest, xgboost\n", "\n", - "▹ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields card_id, store_id, amount, lat\n", - "DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields card_id, store_id, amount, lat\n", - "DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields card_id, store_id, amount, lng\n", - "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:14\n", - "✔ XGBoost Classifier w/ One Hot Encod... 20%|██ | Elapsed:00:41\n", - "✔ Random Forest Classifier w/ One Hot... 40%|████ | Elapsed:02:59\n", - "▹ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:02:59DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields store_id, amount\n", - "DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields store_id, amount\n", - "DataFrame.dtypes for data must be int, float or bool.\n", - " Did not expect the data types in fields amount, lat\n", - "✔ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:03:14\n", - "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:03:40\n", - "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:03:40\n", + "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:28\n", + "✔ XGBoost Classifier w/ One Hot Encod... 20%|██ | Elapsed:00:58\n", + "✔ Random Forest Classifier w/ One Hot... 40%|████ | Elapsed:03:30\n", + "✔ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:04:00\n", + "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:04:34\n", + "✔ Logistic Regression Classifier w/ O... 
100%|██████████| Elapsed:04:34\n", "\n", "✔ Optimization finished\n" ] @@ -544,28 +520,28 @@ " \n", " \n", " 2\n", - " 4\n", - " LogisticRegressionPipeline\n", - " 0.831181\n", - " False\n", - " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", - " \n", - " \n", - " 3\n", " 0\n", " XGBoostPipeline\n", - " NaN\n", + " 0.852527\n", " False\n", " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", " \n", - " 4\n", + " 3\n", " 3\n", " XGBoostPipeline\n", - " NaN\n", + " 0.847393\n", " False\n", " {'eta': 0.5288949197529046, 'min_child_weight'...\n", " \n", + " \n", + " 4\n", + " 4\n", + " LogisticRegressionPipeline\n", + " 0.831181\n", + " False\n", + " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", + " \n", " \n", "\n", "" @@ -574,16 +550,16 @@ " id pipeline_name score high_variance_cv \\\n", "0 2 RFClassificationPipeline 0.873053 False \n", "1 1 XGBoostPipeline 0.867186 False \n", - "2 4 LogisticRegressionPipeline 0.831181 False \n", - "3 0 XGBoostPipeline NaN False \n", - "4 3 XGBoostPipeline NaN False \n", + "2 0 XGBoostPipeline 0.852527 False \n", + "3 3 XGBoostPipeline 0.847393 False \n", + "4 4 LogisticRegressionPipeline 0.831181 False \n", "\n", " parameters \n", "0 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", "1 {'eta': 0.38438170729269994, 'min_child_weight... \n", - "2 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", - "3 {'eta': 0.5928446182250184, 'min_child_weight'... \n", - "4 {'eta': 0.5288949197529046, 'min_child_weight'... " + "2 {'eta': 0.5928446182250184, 'min_child_weight'... \n", + "3 {'eta': 0.5288949197529046, 'min_child_weight'... \n", + "4 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... " ] }, "execution_count": 12, @@ -603,7 +579,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 13, diff --git a/docs/source/index.ipynb b/docs/source/index.ipynb index c947e9ddba..39a8cd3066 100644 --- a/docs/source/index.ipynb +++ b/docs/source/index.ipynb @@ -111,14 +111,14 @@ "Optimizing for F1. Greater score is better.\n", "\n", "Searching up to 5 pipelines. \n", - "Possible model types: xgboost, linear_model, random_forest\n", + "Possible model types: random_forest, linear_model, xgboost\n", "\n", "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00\n", - "✔ XGBoost Classifier w/ One Hot Encod... 20%|██ | Elapsed:00:01\n", - "✔ Random Forest Classifier w/ One Hot... 40%|████ | Elapsed:00:12\n", - "✔ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:00:12\n", - "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:00:22\n", - "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:00:22\n", + "✔ XGBoost Classifier w/ One Hot Encod... 20%|██ | Elapsed:00:00\n", + "✔ Random Forest Classifier w/ One Hot... 40%|████ | Elapsed:00:05\n", + "✔ XGBoost Classifier w/ One Hot Encod... 60%|██████ | Elapsed:00:06\n", + "✔ Logistic Regression Classifier w/ O... 80%|████████ | Elapsed:00:13\n", + "✔ Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:00:13\n", "\n", "✔ Optimization finished\n" ] @@ -265,10 +265,10 @@ "Problem Types: Binary Classification, Multiclass Classification\n", "Model Type: XGBoost Classifier\n", "Objective to Optimize: F1 (greater is better)\n", - "\n", + "Number of features: 10\n", "\n", "Pipeline Steps\n", - "===============\n", + "==============\n", "1. One Hot Encoder\n", "2. 
Simple Imputer\n", "\t * impute_strategy : most_frequent\n", diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py index 39e23e9817..88972d8689 100644 --- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py @@ -20,21 +20,15 @@ def __init__(self, impute_strategy="most_frequent"): random_state=0) def transform(self, X): - try: - X_t = self._component_obj.transform(X) - if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): - # skLearn's SimpleImputer loses track of column type, so we need to restore - X_t = pd.DataFrame(X_t, columns=X.columns, index=X.index).astype(X.dtypes.to_dict()) - return X_t - except AttributeError: - raise RuntimeError("Transformer requires a transform method or a component_obj that implements transform") + X_t = self._component_obj.transform(X) + if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): + # skLearn's SimpleImputer loses track of column type, so we need to restore + X_t = pd.DataFrame(X_t, columns=X.columns, index=X.index).astype(X.dtypes.to_dict()) + return X_t def fit_transform(self, X, y=None): - try: - X_t = self._component_obj.fit_transform(X, y) - if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): - # skLearn's SimpleImputer loses track of column type, so we need to restore - X_t = pd.DataFrame(X_t, columns=X.columns, index=X.index).astype(X.dtypes.to_dict()) - return X_t - except AttributeError: - raise RuntimeError("Transformer requires a fit_transform method or a component_obj that implements fit_transform") + X_t = self._component_obj.fit_transform(X, y) + if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame): + # skLearn's SimpleImputer loses track of column type, so we need to restore + X_t = pd.DataFrame(X_t, columns=X.columns, index=X.index).astype(X.dtypes.to_dict()) + return X_t diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 8fbf83004f..5475502787 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -95,6 +95,9 @@ def describe(self, return_dict=False): objective_string = "Objective to Optimize: {} ({})".format(self.objective.name, better_string) self.logger.log(objective_string) + if self.estimator.name in self.input_feature_names: + self.logger.log("Number of features: {}".format(len(self.input_feature_names[self.estimator.name]))) + # Summary of steps self.logger.log_subtitle("Pipeline Steps") for number, component in enumerate(self.component_list, 1): From dffa463e03689255eba387864428bc6e0ab96f31 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 12 Nov 2019 18:25:40 -0500 Subject: [PATCH 17/17] deleting unused notebook --- docs/Fraud Prediction Demo.ipynb | 654 ------------------------------- 1 file changed, 654 deletions(-) delete mode 100644 docs/Fraud Prediction Demo.ipynb diff --git a/docs/Fraud Prediction Demo.ipynb b/docs/Fraud Prediction Demo.ipynb deleted file mode 100644 index 8585f344b2..0000000000 --- a/docs/Fraud Prediction Demo.ipynb +++ /dev/null @@ -1,654 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Building a Fraud Prediction Model with EvalML\n", - "\n", - "In this demo, we will build an optimized fraud prediction model using EvalML. To optimize the model we will set up an objective function based on some assumptions about our business. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import evalml\n", - "from evalml.objectives import FraudCost\n", - "from evalml.preprocessing import split_data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Transactions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Number of Features\n", - "Boolean 1\n", - "Categorical 6\n", - "Numeric 5\n", - "\n", - "Number of training examples: 99992\n", - "\n", - "Labels\n", - "False 84.82%\n", - "True 15.18%\n", - "Name: fraud, dtype: object\n" - ] - } - ], - "source": [ - "X, y = evalml.demos.load_fraud()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Objective" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "fraud_objective = FraudCost(\n", - " retry_percentage=.5,\n", - " interchange_fee=.02,\n", - " fraud_payout_percentage=.75,\n", - " amount_col='amount'\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Search for best modeling pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to validate the results of the pipeline creation and optimization process, we will save some of our data as a holdout set" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']\n", - "X = X.select_dtypes(include=numerics)\n", - "X_train, X_holdout, y_train, y_holdout = split_data(X, y, test_size=.8, random_state=0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Because the fraud labels are binary, we will use `AutoClassifier`. When we call `.fit()`, the search for the best pipeline will begin. " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1m*****************************\u001b[0m\n", - "\u001b[1m* Beginning pipeline search *\u001b[0m\n", - "\u001b[1m*****************************\u001b[0m\n", - "\n", - "Optimizing for Fraud Cost. Lower score is better.\n", - "\n", - "Searching up to 20 pipelines. \n", - "Will stop searching for new pipelines after 120 seconds.\n", - "\n", - "Possible model types: random_forest, linear_model, xgboost\n", - "\n", - "✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:02\n", - "▹ XGBoost Classifier w/ One Hot Encod... 5%|▌ | Elapsed:00:02" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n", - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✔ XGBoost Classifier w/ One Hot Encod... 5%|▌ | Elapsed:00:05\n", - "▹ Random Forest Classifier w/ One Hot... 
10%|█ | Elapsed:00:05" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✔ Random Forest Classifier w/ One Hot... 10%|█ | Elapsed:00:39\n", - "▹ XGBoost Classifier w/ One Hot Encod... 15%|█▌ | Elapsed:00:39" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✔ XGBoost Classifier w/ One Hot Encod... 15%|█▌ | Elapsed:00:41\n", - "▹ Logistic Regression Classifier w/ O... 20%|██ | Elapsed:00:41" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✔ Logistic Regression Classifier w/ O... 20%|██ | Elapsed:00:49\n", - "▹ XGBoost Classifier w/ One Hot Encod... 25%|██▌ | Elapsed:00:49" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✔ XGBoost Classifier w/ One Hot Encod... 25%|██▌ | Elapsed:00:52\n", - "▹ Logistic Regression Classifier w/ O... 30%|███ | Elapsed:00:52" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✔ Logistic Regression Classifier w/ O... 30%|███ | Elapsed:01:00\n", - "✔ XGBoost Classifier w/ One Hot Encod... 35%|███▌ | Elapsed:01:06\n", - "▹ Logistic Regression Classifier w/ O... 40%|████ | Elapsed:01:06" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✔ Logistic Regression Classifier w/ O... 40%|████ | Elapsed:01:14\n", - "✔ Logistic Regression Classifier w/ O... 45%|████▌ | Elapsed:01:23\n", - "✔ XGBoost Classifier w/ One Hot Encod... 50%|█████ | Elapsed:01:24\n", - "✔ Logistic Regression Classifier w/ O... 55%|█████▌ | Elapsed:01:32\n", - "▹ Random Forest Classifier w/ One Hot... 
60%|██████ | Elapsed:01:32" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✔ Random Forest Classifier w/ One Hot... 60%|██████ | Elapsed:01:47\n", - "▹ Logistic Regression Classifier w/ O... 65%|██████▌ | Elapsed:01:47" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✔ Logistic Regression Classifier w/ O... 65%|██████▌ | Elapsed:01:56\n", - "✔ XGBoost Classifier w/ One Hot Encod... 70%|███████ | Elapsed:01:57\n", - "▹ Random Forest Classifier w/ One Hot... 75%|███████▌ | Elapsed:01:57" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/angela.lin/Desktop/env/lib/python3.7/site-packages/sklearn/metrics/classification.py:872: RuntimeWarning: invalid value encountered in double_scalars\n", - " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✔ Random Forest Classifier w/ One Hot... 75%|███████▌ | Elapsed:02:19\n", - "✔ Random Forest Classifier w/ One Hot... 80%|████████ | Elapsed:02:19\n", - "\n", - "\n", - "Max time elapsed. Stopping search early.\n", - "\n", - "✔ Optimization finished\n" - ] - } - ], - "source": [ - "clf = evalml.AutoClassifier(objective=fraud_objective,\n", - " max_time=120,\n", - " max_pipelines=20)\n", - "\n", - "clf.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### View rankings and select pipeline\n", - "\n", - "Once the fitting process is done, we can see all of the pipelines that were searched, ranked by their score on the objective function we defined" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idpipeline_namescorehigh_variance_cvparameters
014XGBoostPipeline0.007614False{'eta': 0.264555612104627, 'min_child_weight':...
112RFClassificationPipeline0.007614False{'n_estimators': 369, 'max_depth': 10, 'impute...
22RFClassificationPipeline0.007615False{'n_estimators': 569, 'max_depth': 22, 'impute...
30XGBoostPipeline0.007615False{'eta': 0.5928446182250184, 'min_child_weight'...
43XGBoostPipeline0.007615False{'eta': 0.5288949197529046, 'min_child_weight'...
55XGBoostPipeline0.007615False{'eta': 0.6481718720511973, 'min_child_weight'...
610XGBoostPipeline0.007615False{'eta': 0.7206326547259169, 'min_child_weight'...
77XGBoostPipeline0.007615False{'eta': 0.9786183422327642, 'min_child_weight'...
81XGBoostPipeline0.007616False{'eta': 0.38438170729269994, 'min_child_weight...
915RFClassificationPipeline0.007616False{'n_estimators': 609, 'max_depth': 7, 'impute_...
108LogisticRegressionPipeline0.007616False{'penalty': 'l2', 'C': 0.5765626434012575, 'im...
114LogisticRegressionPipeline0.007616False{'penalty': 'l2', 'C': 8.444214828324364, 'imp...
126LogisticRegressionPipeline0.007616False{'penalty': 'l2', 'C': 6.239401330891865, 'imp...
139LogisticRegressionPipeline0.007616False{'penalty': 'l2', 'C': 8.123565600467177, 'imp...
1411LogisticRegressionPipeline0.007616False{'penalty': 'l2', 'C': 8.362426847738403, 'imp...
1513LogisticRegressionPipeline0.007616False{'penalty': 'l2', 'C': 3.6887329830070748, 'im...
\n", - "
" - ], - "text/plain": [ - " id pipeline_name score high_variance_cv \\\n", - "0 14 XGBoostPipeline 0.007614 False \n", - "1 12 RFClassificationPipeline 0.007614 False \n", - "2 2 RFClassificationPipeline 0.007615 False \n", - "3 0 XGBoostPipeline 0.007615 False \n", - "4 3 XGBoostPipeline 0.007615 False \n", - "5 5 XGBoostPipeline 0.007615 False \n", - "6 10 XGBoostPipeline 0.007615 False \n", - "7 7 XGBoostPipeline 0.007615 False \n", - "8 1 XGBoostPipeline 0.007616 False \n", - "9 15 RFClassificationPipeline 0.007616 False \n", - "10 8 LogisticRegressionPipeline 0.007616 False \n", - "11 4 LogisticRegressionPipeline 0.007616 False \n", - "12 6 LogisticRegressionPipeline 0.007616 False \n", - "13 9 LogisticRegressionPipeline 0.007616 False \n", - "14 11 LogisticRegressionPipeline 0.007616 False \n", - "15 13 LogisticRegressionPipeline 0.007616 False \n", - "\n", - " parameters \n", - "0 {'eta': 0.264555612104627, 'min_child_weight':... \n", - "1 {'n_estimators': 369, 'max_depth': 10, 'impute... \n", - "2 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", - "3 {'eta': 0.5928446182250184, 'min_child_weight'... \n", - "4 {'eta': 0.5288949197529046, 'min_child_weight'... \n", - "5 {'eta': 0.6481718720511973, 'min_child_weight'... \n", - "6 {'eta': 0.7206326547259169, 'min_child_weight'... \n", - "7 {'eta': 0.9786183422327642, 'min_child_weight'... \n", - "8 {'eta': 0.38438170729269994, 'min_child_weight... \n", - "9 {'n_estimators': 609, 'max_depth': 7, 'impute_... \n", - "10 {'penalty': 'l2', 'C': 0.5765626434012575, 'im... \n", - "11 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", - "12 {'penalty': 'l2', 'C': 6.239401330891865, 'imp... \n", - "13 {'penalty': 'l2', 'C': 8.123565600467177, 'imp... \n", - "14 {'penalty': 'l2', 'C': 8.362426847738403, 'imp... \n", - "15 {'penalty': 'l2', 'C': 3.6887329830070748, 'im... " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "clf.rankings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "to select the best pipeline we can run" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline = clf.best_pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "to select another pipeline we can use the id" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline = clf.get_pipeline(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Describe pipeline\n", - "\n", - "You can get more details about any pipeline. Including " - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1m********************************************************************************************\u001b[0m\n", - "\u001b[1m* XGBoost Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model *\u001b[0m\n", - "\u001b[1m********************************************************************************************\u001b[0m\n", - "\n", - "Problem Types: Binary Classification, Multiclass Classification\n", - "Model Type: XGBoost Classifier\n", - "Objective to Optimize: Fraud Cost (lower is better)\n", - "Number of features: 3\n", - "\n", - "Pipeline Steps\n", - "==============\n", - "1. One Hot Encoder\n", - "2. Simple Imputer\n", - "\t * impute_strategy : most_frequent\n", - "3. 
RF Classifier Select From Model\n", - "\t * percent_features : 0.6273280598181127\n", - "\t * threshold : -inf\n", - "4. XGBoost Classifier\n", - "\t * eta : 0.5928446182250184\n", - "\t * max_depth : 4\n", - "\t * min_child_weight : 8.598391737229157\n", - "\n", - "Training\n", - "========\n", - "Training for Binary Classification problems.\n", - "Total training time (including CV): 2.0 seconds\n", - "\n", - "Cross Validation\n", - "----------------\n", - " Fraud Cost F1 Precision Recall AUC Log Loss MCC # Training # Testing\n", - "0 0.008 0.263 0.152 0.263 0.855 0.188 -0.007 13332.000 6666.000\n", - "1 0.008 0.263 0.152 0.263 0.844 0.190 -0.000 13332.000 6666.000\n", - "2 0.008 0.264 0.152 0.264 0.832 0.200 0.000 13332.000 6666.000\n", - "mean 0.008 0.264 0.152 0.264 0.844 0.192 -0.002 - -\n", - "std 0.000 0.000 0.000 0.000 0.012 0.006 0.004 - -\n", - "coef of var 0.002 0.000 0.000 0.000 0.014 0.033 -1.603 - -\n" - ] - } - ], - "source": [ - "clf.describe_pipeline(0)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}
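Note on the simple_imputer.py change above: scikit-learn's SimpleImputer returns a plain
NumPy array from transform/fit_transform, so any pandas dtype information on the input
frame is lost; that is why the patched methods rebuild the DataFrame and cast back with
astype(X.dtypes.to_dict()). The snippet below is a minimal standalone sketch of that
behavior, not evalml code; the column names and values are invented for illustration.

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    # A frame with mixed dtypes and one missing value to impute.
    X = pd.DataFrame({
        "amount": [10.0, np.nan, 30.0],   # float64
        "flagged": [True, False, True],   # bool
    })

    imputer = SimpleImputer(strategy="most_frequent")
    X_t = imputer.fit_transform(X)  # a NumPy array by default: column dtypes are gone

    # Rebuild the frame and restore the original dtypes, mirroring the patched
    # transform()/fit_transform() in simple_imputer.py.
    X_t = pd.DataFrame(X_t, columns=X.columns, index=X.index).astype(X.dtypes.to_dict())
    print(X_t.dtypes)  # amount -> float64, flagged -> bool

A side effect of dropping the try/except in the same hunk is that a component object
without a transform method now raises AttributeError directly rather than the previous
wrapped RuntimeError.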