Skip to content

Commit

Permalink
Improve describe_pipeline by using pipeline's new describe method (#190)
Browse files Browse the repository at this point in the history
* moving describe things

* changelog~

* linting

* updating via comments

* remove dict, add to enum

* reusing dict in str fcn

* addressing comments on describe

* updating notebooks

* removing newline

* updating to fix warning

* moving typing from transformer to simpleimputer

* linting

* lint

* remove dstr

* linting

* adding num features, removing try/except, refreshing notebooks

* deleting unused notebook
  • Loading branch information
angela97lin committed Nov 13, 2019
1 parent 247a546 commit 18ffb63
Show file tree
Hide file tree
Showing 16 changed files with 517 additions and 958 deletions.
519 changes: 0 additions & 519 deletions docs/Fraud Prediction Demo.ipynb

This file was deleted.

83 changes: 45 additions & 38 deletions docs/source/automl/guardrails.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,12 @@
"\n",
"Optimizing for Precision. Greater score is better.\n",
"\n",
"Searching up to 1 pipelines. No time limit is set. Set one using max_time parameter.\n",
"\n",
"Searching up to 1 pipelines. \n",
"Possible model types: linear_model\n",
"\n",
"WARNING: Possible label leakage: leaked_feature, leaked_feature_2\n",
"LogisticRegression w/ imputation + scaling: 0%| | Elapsed:00:03\n",
"LogisticRegression w/ imputation + scaling: 100%|██████████| Elapsed:00:03\n",
"Logistic Regression Classifier w/ O... 0%| | Elapsed:00:07\n",
"Logistic Regression Classifier w/ O... 100%|██████████| Elapsed:00:07\n",
"\n",
"✔ Optimization finished\n"
]
Expand Down Expand Up @@ -182,12 +181,11 @@
"\n",
"Optimizing for Precision. Greater score is better.\n",
"\n",
"Searching up to 1 pipelines. No time limit is set. Set one using max_time parameter.\n",
"\n",
"Searching up to 1 pipelines. \n",
"Possible model types: linear_model, random_forest, xgboost\n",
"\n",
"✔ XGBoost w/ imputation: 0%| | Elapsed:00:00\n",
"✔ XGBoost w/ imputation: 100%|██████████| Elapsed:00:00\n",
"✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00\n",
"✔ XGBoost Classifier w/ One Hot Encod... 100%|██████████| Elapsed:00:00\n",
"\n",
"✔ Optimization finished\n"
]
Expand Down Expand Up @@ -222,35 +220,45 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m************************\u001b[0m\n",
"\u001b[1m* Pipeline Description *\u001b[0m\n",
"\u001b[1m************************\u001b[0m\n",
"\u001b[1m********************************************************************************************\u001b[0m\n",
"\u001b[1m* XGBoost Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model *\u001b[0m\n",
"\u001b[1m********************************************************************************************\u001b[0m\n",
"\n",
"Pipeline Name: XGBoost w/ imputation\n",
"Model type: ModelTypes.XGBOOST\n",
"Objective: Precision (greater is better)\n",
"Total training time (including CV): 0.4 seconds\n",
"Problem Types: Binary Classification, Multiclass Classification\n",
"Model Type: XGBoost Classifier\n",
"Objective to Optimize: Precision (greater is better)\n",
"Number of features: 18\n",
"\n",
"Parameters\n",
"==========\n",
"• eta: 0.5928446182250184\n",
"• min_child_weight: 8.598391737229157\n",
"• max_depth: 4\n",
"• impute_strategy: most_frequent\n",
"• percent_features: 0.6273280598181127\n",
"Pipeline Steps\n",
"==============\n",
"1. One Hot Encoder\n",
"2. Simple Imputer\n",
"\t * impute_strategy : most_frequent\n",
"3. RF Classifier Select From Model\n",
"\t * percent_features : 0.6273280598181127\n",
"\t * threshold : -inf\n",
"4. XGBoost Classifier\n",
"\t * eta : 0.5928446182250184\n",
"\t * max_depth : 4\n",
"\t * min_child_weight : 8.598391737229157\n",
"\n",
"Training\n",
"========\n",
"Training for Binary Classification problems.\n",
"Total training time (including CV): 0.5 seconds\n",
"\n",
"Cross Validation\n",
"=================\n",
"----------------\n",
" Precision F1 Recall AUC Log Loss MCC # Training # Testing\n",
"0 0.974 0.822 0.822 0.950 0.578 0.650 83.000 81.000\n",
"1 1.000 0.988 0.988 1.000 0.163 0.976 164.000 81.000\n",
"2 0.964 0.972 0.972 0.968 0.134 0.916 245.000 81.000\n",
"3 1.000 0.955 0.955 0.997 0.106 0.866 326.000 81.000\n",
"4 1.000 0.968 0.968 0.998 0.116 0.871 407.000 81.000\n",
"5 0.983 0.983 0.983 0.998 0.077 0.936 488.000 81.000\n",
"mean 0.987 0.948 0.948 0.985 0.196 0.869 - -\n",
"std 0.016 0.063 0.063 0.021 0.190 0.115 - -\n",
"coef of var 0.016 0.066 0.066 0.021 0.969 0.132 - -\n"
"2 0.981 0.981 0.981 0.968 0.139 0.944 245.000 81.000\n",
"3 0.963 0.929 0.929 0.991 0.113 0.774 326.000 81.000\n",
"4 0.984 0.960 0.960 0.993 0.147 0.830 407.000 81.000\n",
"5 0.983 0.983 0.983 0.998 0.083 0.936 488.000 81.000\n",
"mean 0.981 0.944 0.944 0.983 0.204 0.852 - -\n",
"std 0.012 0.064 0.064 0.020 0.186 0.125 - -\n",
"coef of var 0.013 0.067 0.067 0.020 0.909 0.147 - -\n"
]
}
],
Expand Down Expand Up @@ -309,7 +317,7 @@
" <td>0</td>\n",
" <td>0</td>\n",
" <td>XGBoostPipeline</td>\n",
" <td>0.986776</td>\n",
" <td>0.980845</td>\n",
" <td>False</td>\n",
" <td>{'eta': 0.5928446182250184, 'min_child_weight'...</td>\n",
" </tr>\n",
Expand All @@ -319,7 +327,7 @@
],
"text/plain": [
" id pipeline_name score high_variance_cv \\\n",
"0 0 XGBoostPipeline 0.986776 False \n",
"0 0 XGBoostPipeline 0.980845 False \n",
"\n",
" parameters \n",
"0 {'eta': 0.5928446182250184, 'min_child_weight'... "
Expand Down Expand Up @@ -370,14 +378,13 @@
"\n",
"Optimizing for Recall. Greater score is better.\n",
"\n",
"Searching up to 3 pipelines. No time limit is set. Set one using max_time parameter.\n",
"\n",
"Searching up to 3 pipelines. \n",
"Possible model types: linear_model, random_forest, xgboost\n",
"\n",
"✔ XGBoost w/ imputation: 0%| | Elapsed:00:00\n",
"✔ XGBoost w/ imputation: 33%|███▎ | Elapsed:00:00\n",
"✔ Random Forest w/ imputation: 67%|██████▋ | Elapsed:00:06\n",
"✔ Random Forest w/ imputation: 100%|██████████| Elapsed:00:06\n",
"✔ XGBoost Classifier w/ One Hot Encod... 0%| | Elapsed:00:00\n",
"✔ XGBoost Classifier w/ One Hot Encod... 33%|███▎ | Elapsed:00:00\n",
"✔ Random Forest Classifier w/ One Hot... 67%|██████▋ | Elapsed:00:06\n",
"✔ Random Forest Classifier w/ One Hot... 100%|██████████| Elapsed:00:06\n",
"\n",
"✔ Optimization finished\n"
]
Expand Down
107 changes: 58 additions & 49 deletions docs/source/automl/regression_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,15 @@
"\n",
"Optimizing for R2. Greater score is better.\n",
"\n",
"Searching up to 5 pipelines. No time limit is set. Set one using max_time parameter.\n",
"Searching up to 5 pipelines. \n",
"Possible model types: linear_model, random_forest\n",
"\n",
"Possible model types: random_forest\n",
"\n",
"✔ Random Forest w/ imputation: 0%| | Elapsed:00:05\n",
"✔ Random Forest w/ imputation: 20%|██ | Elapsed:00:10\n",
"✔ Random Forest w/ imputation: 40%|████ | Elapsed:00:16\n",
"✔ Random Forest w/ imputation: 60%|██████ | Elapsed:00:22\n",
"✔ Random Forest w/ imputation: 80%|████████ | Elapsed:00:30\n",
"✔ Random Forest w/ imputation: 100%|██████████| Elapsed:00:30\n",
"✔ Random Forest Regressor w/ One Hot ... 0%| | Elapsed:00:05\n",
"✔ Random Forest Regressor w/ One Hot ... 20%|██ | Elapsed:00:09\n",
"✔ Linear Regressor w/ One Hot Encoder... 40%|████ | Elapsed:00:09\n",
"✔ Random Forest Regressor w/ One Hot ... 40%|████ | Elapsed:00:15\n",
"✔ Random Forest Regressor w/ One Hot ... 80%|████████ | Elapsed:00:21\n",
"✔ Random Forest Regressor w/ One Hot ... 100%|██████████| Elapsed:00:21\n",
"\n",
"✔ Optimization finished\n"
]
Expand Down Expand Up @@ -86,31 +85,31 @@
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>RFRegressionPipeline</td>\n",
" <td>0.422322</td>\n",
" <td>2</td>\n",
" <td>LinearRegressionPipeline</td>\n",
" <td>0.488703</td>\n",
" <td>False</td>\n",
" <td>{'n_estimators': 569, 'max_depth': 22, 'impute...</td>\n",
" <td>{'impute_strategy': 'mean', 'normalize': True,...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>RFRegressionPipeline</td>\n",
" <td>0.417416</td>\n",
" <td>0.422322</td>\n",
" <td>False</td>\n",
" <td>{'n_estimators': 859, 'max_depth': 6, 'impute_...</td>\n",
" <td>{'n_estimators': 569, 'max_depth': 22, 'impute...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>RFRegressionPipeline</td>\n",
" <td>0.391463</td>\n",
" <td>False</td>\n",
" <td>{'n_estimators': 715, 'max_depth': 7, 'impute_...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>RFRegressionPipeline</td>\n",
" <td>0.383134</td>\n",
" <td>False</td>\n",
Expand All @@ -129,16 +128,16 @@
"</div>"
],
"text/plain": [
" id pipeline_name score high_variance_cv \\\n",
"0 0 RFRegressionPipeline 0.422322 False \n",
"1 4 RFRegressionPipeline 0.417416 False \n",
"2 3 RFRegressionPipeline 0.391463 False \n",
"3 2 RFRegressionPipeline 0.383134 False \n",
"4 1 RFRegressionPipeline 0.381204 False \n",
" id pipeline_name score high_variance_cv \\\n",
"0 2 LinearRegressionPipeline 0.488703 False \n",
"1 0 RFRegressionPipeline 0.422322 False \n",
"2 4 RFRegressionPipeline 0.391463 False \n",
"3 3 RFRegressionPipeline 0.383134 False \n",
"4 1 RFRegressionPipeline 0.381204 False \n",
"\n",
" parameters \n",
"0 {'n_estimators': 569, 'max_depth': 22, 'impute... \n",
"1 {'n_estimators': 859, 'max_depth': 6, 'impute_... \n",
"0 {'impute_strategy': 'mean', 'normalize': True,... \n",
"1 {'n_estimators': 569, 'max_depth': 22, 'impute... \n",
"2 {'n_estimators': 715, 'max_depth': 7, 'impute_... \n",
"3 {'n_estimators': 609, 'max_depth': 7, 'impute_... \n",
"4 {'n_estimators': 369, 'max_depth': 10, 'impute... "
Expand All @@ -161,7 +160,7 @@
{
"data": {
"text/plain": [
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x129924690>"
"<evalml.pipelines.regression.linear_regression.LinearRegressionPipeline at 0x1308f16d0>"
]
},
"execution_count": 3,
Expand All @@ -181,7 +180,7 @@
{
"data": {
"text/plain": [
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x129924690>"
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x12d737610>"
]
},
"execution_count": 4,
Expand All @@ -202,31 +201,41 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m************************\u001b[0m\n",
"\u001b[1m* Pipeline Description *\u001b[0m\n",
"\u001b[1m************************\u001b[0m\n",
"\u001b[1m************************************************************************************************\u001b[0m\n",
"\u001b[1m* Random Forest Regressor w/ One Hot Encoder + Simple Imputer + RF Regressor Select From Model *\u001b[0m\n",
"\u001b[1m************************************************************************************************\u001b[0m\n",
"\n",
"Problem Types: Regression\n",
"Model Type: Random Forest\n",
"Objective to Optimize: R2 (greater is better)\n",
"Number of features: 8\n",
"\n",
"Pipeline Name: Random Forest w/ imputation\n",
"Model type: ModelTypes.RANDOM_FOREST\n",
"Objective: R2 (greater is better)\n",
"Total training time (including CV): 5.8 seconds\n",
"Pipeline Steps\n",
"==============\n",
"1. One Hot Encoder\n",
"2. Simple Imputer\n",
"\t * impute_strategy : most_frequent\n",
"3. RF Regressor Select From Model\n",
"\t * percent_features : 0.8593661614465293\n",
"\t * threshold : -inf\n",
"4. Random Forest Regressor\n",
"\t * n_estimators : 569\n",
"\t * max_depth : 22\n",
"\n",
"Parameters\n",
"==========\n",
"• n_estimators: 569\n",
"• max_depth: 22\n",
"• impute_strategy: most_frequent\n",
"• percent_features: 0.8593661614465293\n",
"Training\n",
"========\n",
"Training for Regression problems.\n",
"Total training time (including CV): 5.6 seconds\n",
"\n",
"Cross Validation\n",
"=================\n",
" R2 # Training # Testing\n",
"0 0.427 294.000 148.000\n",
"1 0.450 295.000 147.000\n",
"2 0.390 295.000 147.000\n",
"mean 0.422 - -\n",
"std 0.031 - -\n",
"coef of var 0.072 - -\n"
"----------------\n",
" R2 MAE MSE MSLE MedianAE MaxError ExpVariance # Training # Testing\n",
"0 0.427 46.033 3276.018 0.194 39.699 161.858 0.428 294.000 148.000\n",
"1 0.450 48.953 3487.566 0.193 44.344 160.513 0.451 295.000 147.000\n",
"2 0.390 47.401 3477.117 0.193 41.297 171.420 0.390 295.000 147.000\n",
"mean 0.422 47.462 3413.567 0.193 41.780 164.597 0.423 - -\n",
"std 0.031 1.461 119.235 0.000 2.360 5.947 0.031 - -\n",
"coef of var 0.072 0.031 0.035 0.002 0.056 0.036 0.073 - -\n"
]
}
],
Expand Down
Loading

0 comments on commit 18ffb63

Please sign in to comment.