From 433a0128af3f810f032e8b7a48f9abf31cab0185 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Fri, 13 Sep 2019 12:49:17 -0400 Subject: [PATCH 01/18] Added categorical support for regression --- evalml/pipelines/regression/random_forest.py | 11 ++++++++--- evalml/tests/test_autoregressor.py | 7 +++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/evalml/pipelines/regression/random_forest.py b/evalml/pipelines/regression/random_forest.py index ad93a8756c..c91c8493eb 100644 --- a/evalml/pipelines/regression/random_forest.py +++ b/evalml/pipelines/regression/random_forest.py @@ -1,9 +1,11 @@ +import category_encoders as ce import numpy as np import pandas as pd from sklearn.ensemble import RandomForestRegressor from sklearn.feature_selection import SelectFromModel from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder from skopt.space import Integer, Real from evalml.pipelines import PipelineBase @@ -20,13 +22,15 @@ class RFRegressionPipeline(PipelineBase): "n_estimators": Integer(10, 1000), "max_depth": Integer(1, 32), "impute_strategy": ["mean", "median", "most_frequent"], + "drop_invariant": [True, False], "percent_features": Real(.01, 1) } - def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_features, + def __init__(self, objective, n_estimators, max_depth, impute_strategy, drop_invariant, percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) + enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=False) estimator = RandomForestRegressor(random_state=random_state, n_estimators=n_estimators, @@ -40,9 +44,10 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_ ) self.pipeline = Pipeline( - [("imputer", imputer), + [("encoder", enc), + ("imputer", imputer), ("feature_selection", feature_selection), - ("estimator", estimator)] + ("estimator", estimator)], ) super().__init__(objective=objective, random_state=random_state) diff --git a/evalml/tests/test_autoregressor.py b/evalml/tests/test_autoregressor.py index c1d86b5d75..50a6d344de 100644 --- a/evalml/tests/test_autoregressor.py +++ b/evalml/tests/test_autoregressor.py @@ -53,11 +53,10 @@ def test_random_state(X_y): def test_categorical(X_y_categorical_regression): X, y = X_y_categorical_regression - clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0) + clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0, model_types=["random_forest"]) error_msg = 'contains non-numerical data' - with pytest.raises(ValueError, match=error_msg): - clf.fit(X, y, raise_errors=True) - + clf.fit(X.values, y, raise_errors=True) + assert clf.rankings['score'].isnull().any() == False def test_callback(X_y): X, y = X_y From 7121f63f42dfdd5f1f883f0746a3f30ab02dea3f Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Fri, 13 Sep 2019 14:51:38 -0400 Subject: [PATCH 02/18] Added categorical support for classification --- .../pipelines/classification/logistic_regression.py | 8 ++++++-- evalml/pipelines/classification/random_forest.py | 12 ++++++++---- evalml/pipelines/classification/xgboost.py | 10 +++++++--- evalml/tests/conftest.py | 8 ++++++++ evalml/tests/test_autoclassifier.py | 8 ++++++++ evalml/tests/test_autoregressor.py | 2 +- 6 files changed, 38 insertions(+), 10 deletions(-) diff --git a/evalml/pipelines/classification/logistic_regression.py b/evalml/pipelines/classification/logistic_regression.py index e064a33030..4e33ab9ca6 100644 
--- a/evalml/pipelines/classification/logistic_regression.py +++ b/evalml/pipelines/classification/logistic_regression.py @@ -1,3 +1,4 @@ +import category_encoders as ce import numpy as np import pandas as pd from sklearn.impute import SimpleImputer @@ -20,11 +21,13 @@ class LogisticRegressionPipeline(PipelineBase): "penalty": ["l2"], "C": Real(.01, 10), "impute_strategy": ["mean", "median", "most_frequent"], + "drop_invariant": [True, False] } - def __init__(self, objective, penalty, C, impute_strategy, + def __init__(self, objective, penalty, C, impute_strategy, drop_invariant, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) + enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=False) estimator = LogisticRegression(random_state=random_state, penalty=penalty, @@ -34,7 +37,8 @@ def __init__(self, objective, penalty, C, impute_strategy, n_jobs=-1) self.pipeline = Pipeline( - [("imputer", imputer), + [("encoder", enc), + ("imputer", imputer), ("scaler", StandardScaler()), ("estimator", estimator)] ) diff --git a/evalml/pipelines/classification/random_forest.py b/evalml/pipelines/classification/random_forest.py index 06ea8cb1b1..7f707b3475 100644 --- a/evalml/pipelines/classification/random_forest.py +++ b/evalml/pipelines/classification/random_forest.py @@ -1,3 +1,4 @@ +import category_encoders as ce import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier @@ -20,12 +21,14 @@ class RFClassificationPipeline(PipelineBase): "n_estimators": Integer(10, 1000), "max_depth": Integer(1, 32), "impute_strategy": ["mean", "median", "most_frequent"], - "percent_features": Real(.01, 1) + "percent_features": Real(.01, 1), + "drop_invariant": [True, False] } - def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_features, - number_features, n_jobs=1, random_state=0): + def __init__(self, objective, n_estimators, max_depth, impute_strategy, drop_invariant, + percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) + enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=False) estimator = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators, @@ -39,7 +42,8 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_ ) self.pipeline = Pipeline( - [("imputer", imputer), + [("encoder", enc), + ("imputer", imputer), ("feature_selection", feature_selection), ("estimator", estimator)] ) diff --git a/evalml/pipelines/classification/xgboost.py b/evalml/pipelines/classification/xgboost.py index ae2bc63afa..b15e6659a3 100644 --- a/evalml/pipelines/classification/xgboost.py +++ b/evalml/pipelines/classification/xgboost.py @@ -1,3 +1,4 @@ +import category_encoders as ce import numpy as np import pandas as pd from sklearn.feature_selection import SelectFromModel @@ -21,12 +22,14 @@ class XGBoostPipeline(PipelineBase): "min_child_weight": Real(1, 10), "max_depth": Integer(1, 20), "impute_strategy": ["mean", "median", "most_frequent"], + "drop_invariant": [True, False], "percent_features": Real(.01, 1) } - def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, percent_features, - number_features, n_jobs=1, random_state=0): + def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, drop_invariant, + percent_features,number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) + enc = 
ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=False) estimator = XGBClassifier( random_state=random_state, @@ -42,7 +45,8 @@ def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, ) self.pipeline = Pipeline( - [("imputer", imputer), + [("encoder", enc), + ("imputer", imputer), ("feature_selection", feature_selection), ("estimator", estimator)] ) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 84eacec411..0782b960d4 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -28,6 +28,14 @@ def X_y_categorical_regression(): return X, y +@pytest.fixture +def X_y_categorical_classification(): + titanic = pd.read_csv('https://featuretools-static.s3.amazonaws.com/evalml/Titanic/train.csv') + y = titanic['Survived'] + X = titanic.drop('Survived', axis=1) + return X, y + + @pytest.fixture def trained_model(X_y): X, y = X_y diff --git a/evalml/tests/test_autoclassifier.py b/evalml/tests/test_autoclassifier.py index 5f5b06ae0a..540f735171 100644 --- a/evalml/tests/test_autoclassifier.py +++ b/evalml/tests/test_autoclassifier.py @@ -119,6 +119,14 @@ def test_multi_auto(X_y_multi): assert clf.default_objectives == get_objectives('multiclass') +def test_categorical_auto(X_y_categorical_classification): + X, y = X_y_categorical_classification + clf = AutoClassifier(objective="recall", max_pipelines=5, multiclass=False) + clf.fit(X.values, y, raise_errors=True) + print(clf.rankings) + assert clf.rankings['score'].isnull().any() == False + + def test_random_state(X_y): X, y = X_y diff --git a/evalml/tests/test_autoregressor.py b/evalml/tests/test_autoregressor.py index 50a6d344de..4b92f75a6e 100644 --- a/evalml/tests/test_autoregressor.py +++ b/evalml/tests/test_autoregressor.py @@ -54,10 +54,10 @@ def test_random_state(X_y): def test_categorical(X_y_categorical_regression): X, y = X_y_categorical_regression clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0, model_types=["random_forest"]) - error_msg = 'contains non-numerical data' clf.fit(X.values, y, raise_errors=True) assert clf.rankings['score'].isnull().any() == False + def test_callback(X_y): X, y = X_y From 175ef93842bd7947dab7799b3b9d91d39437e8b6 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Fri, 13 Sep 2019 14:52:25 -0400 Subject: [PATCH 03/18] Remove error --- evalml/models/auto_base.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py index 6cacb3dcd6..91b41b6003 100644 --- a/evalml/models/auto_base.py +++ b/evalml/models/auto_base.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd from colorama import Style -from pandas.api.types import is_numeric_dtype from tqdm import tqdm from evalml import preprocessing @@ -97,10 +96,6 @@ def fit(self, X, y, feature_types=None, raise_errors=False): if not isinstance(y, pd.Series): y = pd.Series(y) - for col in X.columns: - if not is_numeric_dtype(X[col]): - raise ValueError("Input column '{}' contains non-numerical data".format(col)) - self._log_title("Beginning pipeline search") self._log("Optimizing for %s. 
" % self.objective.name, new_line=False) @@ -158,6 +153,9 @@ def _do_iteration(self, X, y, pbar, raise_errors): pbar.set_description("Testing %s" % (pipeline_class.name)) start = time.time() + # print(X) + # print(y) + # print(pipeline.pipeline) scores = [] all_objective_scores = [] for train, test in self.cv.split(X, y): From bde1cd851c89bd2725d6a93ce1f3099219539075 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Fri, 13 Sep 2019 14:52:34 -0400 Subject: [PATCH 04/18] Added requiremtns --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index cf43ddcd84..3818470237 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ scikit-optimize[plots] colorama s3fs==0.2.2 joblib>=0.10.3 +category_encoders From 058cf4638721ee496e39995fa04d6ee58a9424fd Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Fri, 13 Sep 2019 14:54:26 -0400 Subject: [PATCH 05/18] lint --- evalml/pipelines/classification/random_forest.py | 2 +- evalml/pipelines/classification/xgboost.py | 2 +- evalml/pipelines/regression/random_forest.py | 1 - evalml/tests/test_autoclassifier.py | 2 +- evalml/tests/test_autoregressor.py | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/evalml/pipelines/classification/random_forest.py b/evalml/pipelines/classification/random_forest.py index 7f707b3475..ffd3249afb 100644 --- a/evalml/pipelines/classification/random_forest.py +++ b/evalml/pipelines/classification/random_forest.py @@ -26,7 +26,7 @@ class RFClassificationPipeline(PipelineBase): } def __init__(self, objective, n_estimators, max_depth, impute_strategy, drop_invariant, - percent_features, number_features, n_jobs=1, random_state=0): + percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=False) diff --git a/evalml/pipelines/classification/xgboost.py b/evalml/pipelines/classification/xgboost.py index b15e6659a3..99bc2826b6 100644 --- a/evalml/pipelines/classification/xgboost.py +++ b/evalml/pipelines/classification/xgboost.py @@ -27,7 +27,7 @@ class XGBoostPipeline(PipelineBase): } def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, drop_invariant, - percent_features,number_features, n_jobs=1, random_state=0): + percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=False) diff --git a/evalml/pipelines/regression/random_forest.py b/evalml/pipelines/regression/random_forest.py index c91c8493eb..491d11dacf 100644 --- a/evalml/pipelines/regression/random_forest.py +++ b/evalml/pipelines/regression/random_forest.py @@ -5,7 +5,6 @@ from sklearn.feature_selection import SelectFromModel from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder from skopt.space import Integer, Real from evalml.pipelines import PipelineBase diff --git a/evalml/tests/test_autoclassifier.py b/evalml/tests/test_autoclassifier.py index 540f735171..c03d8fc913 100644 --- a/evalml/tests/test_autoclassifier.py +++ b/evalml/tests/test_autoclassifier.py @@ -124,7 +124,7 @@ def test_categorical_auto(X_y_categorical_classification): clf = AutoClassifier(objective="recall", max_pipelines=5, multiclass=False) clf.fit(X.values, y, raise_errors=True) print(clf.rankings) - assert clf.rankings['score'].isnull().any() == False + assert 
clf.rankings['score'].isnull().any() is False def test_random_state(X_y): diff --git a/evalml/tests/test_autoregressor.py b/evalml/tests/test_autoregressor.py index 4b92f75a6e..2c33a4fb81 100644 --- a/evalml/tests/test_autoregressor.py +++ b/evalml/tests/test_autoregressor.py @@ -55,7 +55,7 @@ def test_categorical(X_y_categorical_regression): X, y = X_y_categorical_regression clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0, model_types=["random_forest"]) clf.fit(X.values, y, raise_errors=True) - assert clf.rankings['score'].isnull().any() == False + assert clf.rankings['score'].isnull().any() is False def test_callback(X_y): From bb85d62f8fdec55b598df2d82107863ffd4f0a18 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Fri, 13 Sep 2019 14:56:11 -0400 Subject: [PATCH 06/18] Remove prints --- evalml/models/auto_base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py index 91b41b6003..8cd450cab0 100644 --- a/evalml/models/auto_base.py +++ b/evalml/models/auto_base.py @@ -153,9 +153,6 @@ def _do_iteration(self, X, y, pbar, raise_errors): pbar.set_description("Testing %s" % (pipeline_class.name)) start = time.time() - # print(X) - # print(y) - # print(pipeline.pipeline) scores = [] all_objective_scores = [] for train, test in self.cv.split(X, y): From f323a4333e1f94abae31c36e0a09157773c14474 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Fri, 13 Sep 2019 16:03:04 -0400 Subject: [PATCH 07/18] Fix tests --- evalml/pipelines/classification/logistic_regression.py | 2 +- evalml/pipelines/classification/random_forest.py | 2 +- evalml/pipelines/classification/xgboost.py | 2 +- evalml/pipelines/regression/random_forest.py | 2 +- evalml/tests/test_autoclassifier.py | 3 +-- evalml/tests/test_autoregressor.py | 2 +- evalml/tests/test_logistic_regression.py | 2 +- evalml/tests/test_objectives.py | 2 +- evalml/tests/test_pipelines.py | 6 +++--- evalml/tests/test_rf.py | 2 +- evalml/tests/test_xgboost.py | 2 +- 11 files changed, 13 insertions(+), 14 deletions(-) diff --git a/evalml/pipelines/classification/logistic_regression.py b/evalml/pipelines/classification/logistic_regression.py index 4e33ab9ca6..fb6bd70873 100644 --- a/evalml/pipelines/classification/logistic_regression.py +++ b/evalml/pipelines/classification/logistic_regression.py @@ -27,7 +27,7 @@ class LogisticRegressionPipeline(PipelineBase): def __init__(self, objective, penalty, C, impute_strategy, drop_invariant, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) - enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=False) + enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True) estimator = LogisticRegression(random_state=random_state, penalty=penalty, diff --git a/evalml/pipelines/classification/random_forest.py b/evalml/pipelines/classification/random_forest.py index ffd3249afb..cfd8497406 100644 --- a/evalml/pipelines/classification/random_forest.py +++ b/evalml/pipelines/classification/random_forest.py @@ -28,7 +28,7 @@ class RFClassificationPipeline(PipelineBase): def __init__(self, objective, n_estimators, max_depth, impute_strategy, drop_invariant, percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) - enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=False) + enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True) estimator = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators, 
diff --git a/evalml/pipelines/classification/xgboost.py b/evalml/pipelines/classification/xgboost.py index 99bc2826b6..eeefc68d47 100644 --- a/evalml/pipelines/classification/xgboost.py +++ b/evalml/pipelines/classification/xgboost.py @@ -29,7 +29,7 @@ class XGBoostPipeline(PipelineBase): def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, drop_invariant, percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) - enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=False) + enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True) estimator = XGBClassifier( random_state=random_state, diff --git a/evalml/pipelines/regression/random_forest.py b/evalml/pipelines/regression/random_forest.py index 491d11dacf..ae5cb3eddd 100644 --- a/evalml/pipelines/regression/random_forest.py +++ b/evalml/pipelines/regression/random_forest.py @@ -29,7 +29,7 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, drop_inv number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) - enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=False) + enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True) estimator = RandomForestRegressor(random_state=random_state, n_estimators=n_estimators, diff --git a/evalml/tests/test_autoclassifier.py b/evalml/tests/test_autoclassifier.py index c03d8fc913..ab49cebeee 100644 --- a/evalml/tests/test_autoclassifier.py +++ b/evalml/tests/test_autoclassifier.py @@ -123,8 +123,7 @@ def test_categorical_auto(X_y_categorical_classification): X, y = X_y_categorical_classification clf = AutoClassifier(objective="recall", max_pipelines=5, multiclass=False) clf.fit(X.values, y, raise_errors=True) - print(clf.rankings) - assert clf.rankings['score'].isnull().any() is False + assert not clf.rankings['score'].isnull().all() def test_random_state(X_y): diff --git a/evalml/tests/test_autoregressor.py b/evalml/tests/test_autoregressor.py index 2c33a4fb81..633990cfc1 100644 --- a/evalml/tests/test_autoregressor.py +++ b/evalml/tests/test_autoregressor.py @@ -55,7 +55,7 @@ def test_categorical(X_y_categorical_regression): X, y = X_y_categorical_regression clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0, model_types=["random_forest"]) clf.fit(X.values, y, raise_errors=True) - assert clf.rankings['score'].isnull().any() is False + assert not clf.rankings['score'].isnull().all() def test_callback(X_y): diff --git a/evalml/tests/test_logistic_regression.py b/evalml/tests/test_logistic_regression.py index d98e41016e..44209ac880 100644 --- a/evalml/tests/test_logistic_regression.py +++ b/evalml/tests/test_logistic_regression.py @@ -7,7 +7,7 @@ def test_lr_multi(X_y_multi): X, y = X_y_multi objective = PrecisionMicro() - clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0])) + clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0])) clf.fit(X, y) clf.score(X, y) y_pred = clf.predict(X) diff --git a/evalml/tests/test_objectives.py b/evalml/tests/test_objectives.py index 35df777b6a..a0d459bff5 100644 --- a/evalml/tests/test_objectives.py +++ b/evalml/tests/test_objectives.py @@ -26,7 +26,7 @@ def test_binary_average(X_y): X = pd.DataFrame(X) y = pd.Series(y) - pipeline = LogisticRegressionPipeline(objective=Precision(), penalty='l2', C=1.0, 
impute_strategy='mean', number_features=0) + pipeline = LogisticRegressionPipeline(objective=Precision(), penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=0) pipeline.fit(X, y) y_pred = pipeline.predict(X) diff --git a/evalml/tests/test_pipelines.py b/evalml/tests/test_pipelines.py index b549a1fdb9..3813b0c26e 100644 --- a/evalml/tests/test_pipelines.py +++ b/evalml/tests/test_pipelines.py @@ -43,7 +43,7 @@ def test_serialization(X_y, trained_model, path_management): path = os.path.join(path_management, 'pipe.pkl') objective = Precision() - pipeline = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0])) + pipeline = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0])) pipeline.fit(X, y) save_pipeline(pipeline, path) assert pipeline.score(X, y) == load_pipeline(path).score(X, y) @@ -60,10 +60,10 @@ def test_reproducibility(X_y): amount_col=10 ) - clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]), random_state=0) + clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0]), random_state=0) clf.fit(X, y) - clf_1 = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]), random_state=0) + clf_1 = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0]), random_state=0) clf_1.fit(X, y) assert clf_1.score(X, y) == clf.score(X, y) diff --git a/evalml/tests/test_rf.py b/evalml/tests/test_rf.py index 2e3ec42c59..c49f43c014 100644 --- a/evalml/tests/test_rf.py +++ b/evalml/tests/test_rf.py @@ -7,7 +7,7 @@ def test_rf_multi(X_y_multi): X, y = X_y_multi objective = PrecisionMicro() - clf = RFClassificationPipeline(objective=objective, n_estimators=10, max_depth=3, impute_strategy='mean', percent_features=1.0, number_features=len(X[0])) + clf = RFClassificationPipeline(objective=objective, n_estimators=10, max_depth=3, impute_strategy='mean', drop_invariant=False, percent_features=1.0, number_features=len(X[0])) clf.fit(X, y) clf.score(X, y) y_pred = clf.predict(X) diff --git a/evalml/tests/test_xgboost.py b/evalml/tests/test_xgboost.py index cc005f9302..05fed92641 100644 --- a/evalml/tests/test_xgboost.py +++ b/evalml/tests/test_xgboost.py @@ -7,7 +7,7 @@ def test_xg_multi(X_y_multi): X, y = X_y_multi objective = PrecisionMicro() - clf = XGBoostPipeline(objective=objective, eta=0.1, min_child_weight=1, max_depth=3, impute_strategy='mean', percent_features=1.0, number_features=len(X[0])) + clf = XGBoostPipeline(objective=objective, eta=0.1, min_child_weight=1, max_depth=3, impute_strategy='mean', drop_invariant=False, percent_features=1.0, number_features=len(X[0])) clf.fit(X, y) clf.score(X, y) y_pred = clf.predict(X) From 9301e894bbb3cb2a00b8ce6d220b048dc8cc6983 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Fri, 13 Sep 2019 16:12:37 -0400 Subject: [PATCH 08/18] Clean up --- evalml/tests/test_autoclassifier.py | 2 +- evalml/tests/test_autoregressor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evalml/tests/test_autoclassifier.py b/evalml/tests/test_autoclassifier.py index ab49cebeee..def93c5a28 100644 --- a/evalml/tests/test_autoclassifier.py +++ 
b/evalml/tests/test_autoclassifier.py @@ -122,7 +122,7 @@ def test_multi_auto(X_y_multi): def test_categorical_auto(X_y_categorical_classification): X, y = X_y_categorical_classification clf = AutoClassifier(objective="recall", max_pipelines=5, multiclass=False) - clf.fit(X.values, y, raise_errors=True) + clf.fit(X.values, y) assert not clf.rankings['score'].isnull().all() diff --git a/evalml/tests/test_autoregressor.py b/evalml/tests/test_autoregressor.py index 633990cfc1..6ef7dd93d7 100644 --- a/evalml/tests/test_autoregressor.py +++ b/evalml/tests/test_autoregressor.py @@ -53,8 +53,8 @@ def test_random_state(X_y): def test_categorical(X_y_categorical_regression): X, y = X_y_categorical_regression - clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0, model_types=["random_forest"]) - clf.fit(X.values, y, raise_errors=True) + clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0) + clf.fit(X.values, y) assert not clf.rankings['score'].isnull().all() From c38fb437fececcc6e8f46e1a01498743f05c6e2f Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Mon, 16 Sep 2019 17:44:24 -0400 Subject: [PATCH 09/18] Cleanup, fix feature_importance, and get rid of unnecessary tuning --- evalml/pipelines/classification/logistic_regression.py | 5 ++--- evalml/pipelines/classification/random_forest.py | 5 ++--- evalml/pipelines/classification/xgboost.py | 5 ++--- evalml/pipelines/pipeline_base.py | 2 +- evalml/pipelines/regression/random_forest.py | 6 +++--- evalml/tests/test_autoregressor.py | 2 ++ evalml/tests/test_logistic_regression.py | 2 +- evalml/tests/test_objectives.py | 2 +- evalml/tests/test_pipelines.py | 6 +++--- evalml/tests/test_rf.py | 2 +- evalml/tests/test_xgboost.py | 2 +- 11 files changed, 19 insertions(+), 20 deletions(-) diff --git a/evalml/pipelines/classification/logistic_regression.py b/evalml/pipelines/classification/logistic_regression.py index fb6bd70873..d77484fbb5 100644 --- a/evalml/pipelines/classification/logistic_regression.py +++ b/evalml/pipelines/classification/logistic_regression.py @@ -21,13 +21,12 @@ class LogisticRegressionPipeline(PipelineBase): "penalty": ["l2"], "C": Real(.01, 10), "impute_strategy": ["mean", "median", "most_frequent"], - "drop_invariant": [True, False] } - def __init__(self, objective, penalty, C, impute_strategy, drop_invariant, + def __init__(self, objective, penalty, C, impute_strategy, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) - enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True) + enc = ce.OneHotEncoder(use_cat_names=True, return_df=True) estimator = LogisticRegression(random_state=random_state, penalty=penalty, diff --git a/evalml/pipelines/classification/random_forest.py b/evalml/pipelines/classification/random_forest.py index cfd8497406..1827a886af 100644 --- a/evalml/pipelines/classification/random_forest.py +++ b/evalml/pipelines/classification/random_forest.py @@ -22,13 +22,12 @@ class RFClassificationPipeline(PipelineBase): "max_depth": Integer(1, 32), "impute_strategy": ["mean", "median", "most_frequent"], "percent_features": Real(.01, 1), - "drop_invariant": [True, False] } - def __init__(self, objective, n_estimators, max_depth, impute_strategy, drop_invariant, + def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) - enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True) + enc = 
ce.OneHotEncoder(use_cat_names=True, return_df=True) estimator = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators, diff --git a/evalml/pipelines/classification/xgboost.py b/evalml/pipelines/classification/xgboost.py index eeefc68d47..242fd0ea73 100644 --- a/evalml/pipelines/classification/xgboost.py +++ b/evalml/pipelines/classification/xgboost.py @@ -22,14 +22,13 @@ class XGBoostPipeline(PipelineBase): "min_child_weight": Real(1, 10), "max_depth": Integer(1, 20), "impute_strategy": ["mean", "median", "most_frequent"], - "drop_invariant": [True, False], "percent_features": Real(.01, 1) } - def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, drop_invariant, + def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) - enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True) + enc = ce.OneHotEncoder(use_cat_names=True, return_df=True) estimator = XGBClassifier( random_state=random_state, diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index d2dac014be..2906202f70 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -31,11 +31,11 @@ def fit(self, X, y, objective_fit_size=.2): if not isinstance(y, pd.Series): y = pd.Series(y) - self.input_feature_names = X.columns.tolist() if self.objective.needs_fitting: X, X_objective, y, y_objective = train_test_split(X, y, test_size=objective_fit_size, random_state=self.random_state) self.pipeline.fit(X, y) + self.input_feature_names = self.pipeline['encoder'].get_feature_names() if self.objective.needs_fitting: if self.objective.fit_needs_proba: diff --git a/evalml/pipelines/regression/random_forest.py b/evalml/pipelines/regression/random_forest.py index ae5cb3eddd..4c91f445f9 100644 --- a/evalml/pipelines/regression/random_forest.py +++ b/evalml/pipelines/regression/random_forest.py @@ -21,15 +21,14 @@ class RFRegressionPipeline(PipelineBase): "n_estimators": Integer(10, 1000), "max_depth": Integer(1, 32), "impute_strategy": ["mean", "median", "most_frequent"], - "drop_invariant": [True, False], "percent_features": Real(.01, 1) } - def __init__(self, objective, n_estimators, max_depth, impute_strategy, drop_invariant, percent_features, + def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) - enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True) + enc = ce.OneHotEncoder(use_cat_names=True, return_df=True) estimator = RandomForestRegressor(random_state=random_state, n_estimators=n_estimators, @@ -55,6 +54,7 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, drop_inv def feature_importances(self): """Return feature importances. 
Features dropped by feature selection are excluded""" indices = self.pipeline["feature_selection"].get_support(indices=True) + # need to fix input_feature_names as it takes from original columns feature_names = list(map(lambda i: self.input_feature_names[i], indices)) importances = list(zip(feature_names, self.pipeline["estimator"].feature_importances_)) # note: this only works for binary importances.sort(key=lambda x: -abs(x[1])) diff --git a/evalml/tests/test_autoregressor.py b/evalml/tests/test_autoregressor.py index 6ef7dd93d7..b1ba3fbc8b 100644 --- a/evalml/tests/test_autoregressor.py +++ b/evalml/tests/test_autoregressor.py @@ -56,6 +56,8 @@ def test_categorical(X_y_categorical_regression): clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0) clf.fit(X.values, y) assert not clf.rankings['score'].isnull().all() + assert not clf.get_pipeline(0).feature_importances.isnull().all().all() + def test_callback(X_y): X, y = X_y diff --git a/evalml/tests/test_logistic_regression.py b/evalml/tests/test_logistic_regression.py index 44209ac880..d98e41016e 100644 --- a/evalml/tests/test_logistic_regression.py +++ b/evalml/tests/test_logistic_regression.py @@ -7,7 +7,7 @@ def test_lr_multi(X_y_multi): X, y = X_y_multi objective = PrecisionMicro() - clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0])) + clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0])) clf.fit(X, y) clf.score(X, y) y_pred = clf.predict(X) diff --git a/evalml/tests/test_objectives.py b/evalml/tests/test_objectives.py index a0d459bff5..35df777b6a 100644 --- a/evalml/tests/test_objectives.py +++ b/evalml/tests/test_objectives.py @@ -26,7 +26,7 @@ def test_binary_average(X_y): X = pd.DataFrame(X) y = pd.Series(y) - pipeline = LogisticRegressionPipeline(objective=Precision(), penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=0) + pipeline = LogisticRegressionPipeline(objective=Precision(), penalty='l2', C=1.0, impute_strategy='mean', number_features=0) pipeline.fit(X, y) y_pred = pipeline.predict(X) diff --git a/evalml/tests/test_pipelines.py b/evalml/tests/test_pipelines.py index 3813b0c26e..b549a1fdb9 100644 --- a/evalml/tests/test_pipelines.py +++ b/evalml/tests/test_pipelines.py @@ -43,7 +43,7 @@ def test_serialization(X_y, trained_model, path_management): path = os.path.join(path_management, 'pipe.pkl') objective = Precision() - pipeline = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0])) + pipeline = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0])) pipeline.fit(X, y) save_pipeline(pipeline, path) assert pipeline.score(X, y) == load_pipeline(path).score(X, y) @@ -60,10 +60,10 @@ def test_reproducibility(X_y): amount_col=10 ) - clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0]), random_state=0) + clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]), random_state=0) clf.fit(X, y) - clf_1 = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0]), random_state=0) + clf_1 = LogisticRegressionPipeline(objective=objective, 
penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]), random_state=0) clf_1.fit(X, y) assert clf_1.score(X, y) == clf.score(X, y) diff --git a/evalml/tests/test_rf.py b/evalml/tests/test_rf.py index c49f43c014..2e3ec42c59 100644 --- a/evalml/tests/test_rf.py +++ b/evalml/tests/test_rf.py @@ -7,7 +7,7 @@ def test_rf_multi(X_y_multi): X, y = X_y_multi objective = PrecisionMicro() - clf = RFClassificationPipeline(objective=objective, n_estimators=10, max_depth=3, impute_strategy='mean', drop_invariant=False, percent_features=1.0, number_features=len(X[0])) + clf = RFClassificationPipeline(objective=objective, n_estimators=10, max_depth=3, impute_strategy='mean', percent_features=1.0, number_features=len(X[0])) clf.fit(X, y) clf.score(X, y) y_pred = clf.predict(X) diff --git a/evalml/tests/test_xgboost.py b/evalml/tests/test_xgboost.py index 05fed92641..cc005f9302 100644 --- a/evalml/tests/test_xgboost.py +++ b/evalml/tests/test_xgboost.py @@ -7,7 +7,7 @@ def test_xg_multi(X_y_multi): X, y = X_y_multi objective = PrecisionMicro() - clf = XGBoostPipeline(objective=objective, eta=0.1, min_child_weight=1, max_depth=3, impute_strategy='mean', drop_invariant=False, percent_features=1.0, number_features=len(X[0])) + clf = XGBoostPipeline(objective=objective, eta=0.1, min_child_weight=1, max_depth=3, impute_strategy='mean', percent_features=1.0, number_features=len(X[0])) clf.fit(X, y) clf.score(X, y) y_pred = clf.predict(X) From 5728f32dcccb3c9017ae2b460f59656301253b4e Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Tue, 17 Sep 2019 10:45:08 -0400 Subject: [PATCH 10/18] set requirement ver --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3818470237..e4f436c2e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,4 @@ scikit-optimize[plots] colorama s3fs==0.2.2 joblib>=0.10.3 -category_encoders +category_encoders==2.0.0 From c5d3b2d5c370b0140d975c0421906013bb98387f Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Tue, 17 Sep 2019 10:53:07 -0400 Subject: [PATCH 11/18] Lint --- evalml/pipelines/classification/random_forest.py | 2 +- evalml/tests/test_autoregressor.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/evalml/pipelines/classification/random_forest.py b/evalml/pipelines/classification/random_forest.py index 1827a886af..4f56eb13a2 100644 --- a/evalml/pipelines/classification/random_forest.py +++ b/evalml/pipelines/classification/random_forest.py @@ -21,7 +21,7 @@ class RFClassificationPipeline(PipelineBase): "n_estimators": Integer(10, 1000), "max_depth": Integer(1, 32), "impute_strategy": ["mean", "median", "most_frequent"], - "percent_features": Real(.01, 1), + "percent_features": Real(.01, 1) } def __init__(self, objective, n_estimators, max_depth, impute_strategy, diff --git a/evalml/tests/test_autoregressor.py b/evalml/tests/test_autoregressor.py index b1ba3fbc8b..33989d5cf7 100644 --- a/evalml/tests/test_autoregressor.py +++ b/evalml/tests/test_autoregressor.py @@ -59,7 +59,6 @@ def test_categorical(X_y_categorical_regression): assert not clf.get_pipeline(0).feature_importances.isnull().all().all() - def test_callback(X_y): X, y = X_y From 3703fe1e7d2658301eaa48b7aab453fc8451da52 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Tue, 17 Sep 2019 13:27:54 -0400 Subject: [PATCH 12/18] Fix tests --- evalml/models/auto_base.py | 1 - evalml/pipelines/pipeline_base.py | 2 +- evalml/tests/test_autoclassifier.py | 3 ++- evalml/tests/test_autoregressor.py | 2 +- 4 files 
changed, 4 insertions(+), 4 deletions(-) diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py index 8cd450cab0..e181790dbc 100644 --- a/evalml/models/auto_base.py +++ b/evalml/models/auto_base.py @@ -160,7 +160,6 @@ def _do_iteration(self, X, y, pbar, raise_errors): X_train, X_test = X.iloc[train], X.iloc[test] else: X_train, X_test = X[train], X[test] - if isinstance(y, pd.Series): y_train, y_test = y.iloc[train], y.iloc[test] else: diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 2906202f70..2170a04e26 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -35,7 +35,7 @@ def fit(self, X, y, objective_fit_size=.2): X, X_objective, y, y_objective = train_test_split(X, y, test_size=objective_fit_size, random_state=self.random_state) self.pipeline.fit(X, y) - self.input_feature_names = self.pipeline['encoder'].get_feature_names() + self.input_feature_names = self.pipeline['encoder'].feature_names if self.objective.needs_fitting: if self.objective.fit_needs_proba: diff --git a/evalml/tests/test_autoclassifier.py b/evalml/tests/test_autoclassifier.py index def93c5a28..a3ee845ddb 100644 --- a/evalml/tests/test_autoclassifier.py +++ b/evalml/tests/test_autoclassifier.py @@ -122,8 +122,9 @@ def test_multi_auto(X_y_multi): def test_categorical_auto(X_y_categorical_classification): X, y = X_y_categorical_classification clf = AutoClassifier(objective="recall", max_pipelines=5, multiclass=False) - clf.fit(X.values, y) + clf.fit(X, y) assert not clf.rankings['score'].isnull().all() + assert not clf.get_pipeline(0).feature_importances.isnull().all().all() def test_random_state(X_y): diff --git a/evalml/tests/test_autoregressor.py b/evalml/tests/test_autoregressor.py index 33989d5cf7..79cce08c12 100644 --- a/evalml/tests/test_autoregressor.py +++ b/evalml/tests/test_autoregressor.py @@ -54,7 +54,7 @@ def test_random_state(X_y): def test_categorical(X_y_categorical_regression): X, y = X_y_categorical_regression clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0) - clf.fit(X.values, y) + clf.fit(X, y) assert not clf.rankings['score'].isnull().all() assert not clf.get_pipeline(0).feature_importances.isnull().all().all() From 00932be0d5bdeb72022042a23fc744f617c76ef5 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Tue, 17 Sep 2019 13:33:13 -0400 Subject: [PATCH 13/18] Remove comment --- evalml/pipelines/regression/random_forest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/evalml/pipelines/regression/random_forest.py b/evalml/pipelines/regression/random_forest.py index 4c91f445f9..4a46c3ac1f 100644 --- a/evalml/pipelines/regression/random_forest.py +++ b/evalml/pipelines/regression/random_forest.py @@ -54,7 +54,6 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_ def feature_importances(self): """Return feature importances. 
Features dropped by feature selection are excluded""" indices = self.pipeline["feature_selection"].get_support(indices=True) - # need to fix input_feature_names as it takes from original columns feature_names = list(map(lambda i: self.input_feature_names[i], indices)) importances = list(zip(feature_names, self.pipeline["estimator"].feature_importances_)) # note: this only works for binary importances.sort(key=lambda x: -abs(x[1])) From 690f14ee34f0c8fbff90c17f227ed48cf0864f63 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Tue, 17 Sep 2019 13:34:39 -0400 Subject: [PATCH 14/18] Change test names --- evalml/tests/test_autoclassifier.py | 2 +- evalml/tests/test_autoregressor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/evalml/tests/test_autoclassifier.py b/evalml/tests/test_autoclassifier.py index a3ee845ddb..933d739ffd 100644 --- a/evalml/tests/test_autoclassifier.py +++ b/evalml/tests/test_autoclassifier.py @@ -119,7 +119,7 @@ def test_multi_auto(X_y_multi): assert clf.default_objectives == get_objectives('multiclass') -def test_categorical_auto(X_y_categorical_classification): +def test_categorical_classification(X_y_categorical_classification): X, y = X_y_categorical_classification clf = AutoClassifier(objective="recall", max_pipelines=5, multiclass=False) clf.fit(X, y) diff --git a/evalml/tests/test_autoregressor.py b/evalml/tests/test_autoregressor.py index 79cce08c12..738a2f8db8 100644 --- a/evalml/tests/test_autoregressor.py +++ b/evalml/tests/test_autoregressor.py @@ -51,7 +51,7 @@ def test_random_state(X_y): assert pd.testing.assert_frame_equal(clf.rankings, clf_1.rankings) is None -def test_categorical(X_y_categorical_regression): +def test_categorical_regression(X_y_categorical_regression): X, y = X_y_categorical_regression clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0) clf.fit(X, y) From ef6e3c899d78f00efef4413d966a1c58567f4b66 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Tue, 17 Sep 2019 16:34:57 -0400 Subject: [PATCH 15/18] Changed data to include categorical type --- evalml/tests/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 0782b960d4..29740382c0 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -25,6 +25,9 @@ def X_y_categorical_regression(): flights = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv') y = flights['tip'] X = flights.drop('tip', axis=1) + + # add categorical dtype + X['smoker'] = X['smoker'].astype('category') return X, y From c0f1006df037ea7f70f25c90f21a764ba4d66119 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Tue, 17 Sep 2019 17:28:30 -0400 Subject: [PATCH 16/18] set to min --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e4f436c2e5..e15bb34e8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,4 @@ scikit-optimize[plots] colorama s3fs==0.2.2 joblib>=0.10.3 -category_encoders==2.0.0 +category_encoders>=2.0.0 From 02a072d6fab8dab206d4369cef30828d7f5f6834 Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Wed, 18 Sep 2019 10:09:05 -0400 Subject: [PATCH 17/18] Updated docs --- docs/source/automl/pipeline_search.ipynb | 2 +- docs/source/automl/regression_example.ipynb | 2 +- docs/source/automl/search_results.ipynb | 2 +- docs/source/demos/fraud.ipynb | 244 ++++++++++---------- docs/source/roadmap.rst | 3 + 5 files changed, 125 insertions(+), 128 deletions(-) diff --git 
a/docs/source/automl/pipeline_search.ipynb b/docs/source/automl/pipeline_search.ipynb index e1b0e6e79d..845902ec96 100644 --- a/docs/source/automl/pipeline_search.ipynb +++ b/docs/source/automl/pipeline_search.ipynb @@ -20,7 +20,7 @@ "source": [ "## How it works\n", "\n", - "EvalML selects and tunes machine learning pipelines built of numerous steps. This includes missing value imputation, feature selection, feature scaling, and finally machine learning. As EvalML tunes pipelines, it uses the objective function selected and configured by the user to guide its search. \n", + "EvalML selects and tunes machine learning pipelines built of numerous steps. This includes encoding categorical data, missing value imputation, feature selection, feature scaling, and finally machine learning. As EvalML tunes pipelines, it uses the objective function selected and configured by the user to guide its search. \n", "\n", "\n", "At each iteration, EvalML uses cross-validation to generate an estimate of the pipeline's performances. If a pipeline has high variance across cross-validation folds, it will provide a warning. In this case, the pipeline may not perform reliably in the future.\n", diff --git a/docs/source/automl/regression_example.ipynb b/docs/source/automl/regression_example.ipynb index afe3fb1ebe..9c47f21cfc 100644 --- a/docs/source/automl/regression_example.ipynb +++ b/docs/source/automl/regression_example.ipynb @@ -245,7 +245,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/docs/source/automl/search_results.ipynb b/docs/source/automl/search_results.ipynb index 407f7cdec3..9ccc9b627b 100644 --- a/docs/source/automl/search_results.ipynb +++ b/docs/source/automl/search_results.ipynb @@ -811,7 +811,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/docs/source/demos/fraud.ipynb b/docs/source/demos/fraud.ipynb index 4332065931..28fd8517e7 100644 --- a/docs/source/demos/fraud.ipynb +++ b/docs/source/demos/fraud.ipynb @@ -90,16 +90,38 @@ "X, y = evalml.demos.load_fraud()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "EvalML natively supports one-hot encoding. Here we keep 1 out of the 6 categorical columns to decrease computation time." + ] + }, { "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "card_id int64\n", + "store_id int64\n", + "amount int64\n", + "currency object\n", + "customer_present bool\n", + "lat float64\n", + "lng float64\n", + "dtype: object\n" + ] + } + ], "source": [ - "# select numeric data before running AutoClassifer\n", - "numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']\n", - "X = X.select_dtypes(include=numerics)\n", - "X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(X, y, test_size=.8, random_state=0)" + "X = X.drop(['datetime', 'expiration_date', 'country', 'region', 'provider'], axis=1)\n", + "X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(X, y, test_size=0.2, random_state=0)\n", + "\n", + "print(X.dtypes)" ] }, { @@ -124,11 +146,11 @@ "\n", "Optimizing for Fraud Cost. Lower score is better.\n", "\n", - "Searching up to 10 pipelines. No time limit is set. Set one using max_time parameter.\n", + "Searching up to 5 pipelines. No time limit is set. 
Set one using max_time parameter.\n", "\n", "Possible model types: xgboost, linear_model, random_forest\n", "\n", - "Testing XGBoost w/ imputation: 100%|██████████| 10/10 [01:05<00:00, 6.53s/it] \n", + "Testing LogisticRegression w/ imputation + scaling: 100%|██████████| 5/5 [18:10<00:00, 218.05s/it]\n", "\n", "✔ Optimization finished\n" ] @@ -136,7 +158,8 @@ ], "source": [ "clf = evalml.AutoClassifier(objective=fraud_objective,\n", - " max_pipelines=10)\n", + " additional_objectives=['auc', 'recall', 'precision'],\n", + " max_pipelines=5)\n", "\n", "clf.fit(X_train, y_train)" ] @@ -185,43 +208,63 @@ " \n", " \n", " \n", - " 0\n", - " 9\n", + " 0\n", + " 1\n", " XGBoostPipeline\n", - " 0.007614\n", + " 0.007623\n", + " False\n", + " {'eta': 0.38438170729269994, 'min_child_weight...\n", + " \n", + " \n", + " 1\n", + " 4\n", + " LogisticRegressionPipeline\n", + " 0.007623\n", " False\n", - " {'eta': 0.6481718720511973, 'min_child_weight'...\n", + " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", " \n", " \n", - " 1\n", " 2\n", + " 0\n", " XGBoostPipeline\n", - " 0.007614\n", + " 0.007623\n", " False\n", " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", " \n", - " 2\n", - " 8\n", + " 3\n", + " 3\n", + " XGBoostPipeline\n", + " 0.007623\n", + " False\n", + " {'eta': 0.5288949197529046, 'min_child_weight'...\n", + " \n", + " \n", + " 4\n", + " 2\n", " RFClassificationPipeline\n", - " 0.007614\n", + " 0.007623\n", " False\n", - " {'n_estimators': 369, 'max_depth': 10, 'impute...\n", + " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id pipeline_name score high_variance_cv \\\n", - "0 9 XGBoostPipeline 0.007614 False \n", - "1 2 XGBoostPipeline 0.007614 False \n", - "2 8 RFClassificationPipeline 0.007614 False \n", + " id pipeline_name score high_variance_cv \\\n", + "0 1 XGBoostPipeline 0.007623 False \n", + "1 4 LogisticRegressionPipeline 0.007623 False \n", + "2 0 XGBoostPipeline 0.007623 False \n", + "3 3 XGBoostPipeline 0.007623 False \n", + "4 2 RFClassificationPipeline 0.007623 False \n", "\n", " parameters \n", - "0 {'eta': 0.6481718720511973, 'min_child_weight'... \n", - "1 {'eta': 0.5928446182250184, 'min_child_weight'... \n", - "2 {'n_estimators': 369, 'max_depth': 10, 'impute... " + "0 {'eta': 0.38438170729269994, 'min_child_weight... \n", + "1 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", + "2 {'eta': 0.5928446182250184, 'min_child_weight'... \n", + "3 {'eta': 0.5288949197529046, 'min_child_weight'... \n", + "4 {'n_estimators': 569, 'max_depth': 22, 'impute... 
" ] }, "execution_count": 6, @@ -230,7 +273,7 @@ } ], "source": [ - "clf.rankings.head(3)" + "clf.rankings" ] }, { @@ -274,25 +317,25 @@ "Pipeline Name: XGBoost w/ imputation\n", "Model type: xgboost\n", "Objective: Fraud Cost (lower is better)\n", - "Total training time (including CV): 6.1 seconds\n", + "Total training time (including CV): 383.7 seconds\n", "\n", "Parameters\n", "==========\n", - "• eta: 0.6481718720511973\n", - "• min_child_weight: 4.314173858564932\n", - "• max_depth: 6\n", - "• impute_strategy: most_frequent\n", - "• percent_features: 0.871312026764351\n", + "• eta: 0.38438170729269994\n", + "• min_child_weight: 3.677811458900251\n", + "• max_depth: 13\n", + "• impute_strategy: median\n", + "• percent_features: 0.793807787701838\n", "\n", "Cross Validation\n", "=================\n", - " F1 Precision Recall AUC Log Loss Fraud Cost # Training # Testing\n", - "0 0.264 0.152 0.264 0.841 0.192 0.008 13332.000 6666.000\n", - "1 0.264 0.152 0.264 0.845 0.191 0.008 13332.000 6666.000\n", - "2 0.264 0.152 0.264 0.834 0.202 0.008 13332.000 6666.000\n", - "mean 0.264 0.152 0.264 0.840 0.195 0.008 - -\n", - "std 0.000 0.000 0.000 0.006 0.006 0.000 - -\n", - "coef of var 0.000 0.000 0.000 0.007 0.029 0.002 - -\n" + " AUC Recall Precision Fraud Cost # Training # Testing\n", + "0 0.831 0.264 0.152 0.008 53328.000 26665.000\n", + "1 0.833 0.264 0.152 0.008 53328.000 26665.000\n", + "2 0.836 0.264 0.152 0.008 53330.000 26663.000\n", + "mean 0.834 0.264 0.152 0.008 - -\n", + "std 0.003 0.000 0.000 0.000 - -\n", + "coef of var 0.003 0.000 0.000 0.003 - -\n" ] } ], @@ -317,7 +360,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 9, @@ -344,8 +387,8 @@ { "data": { "text/plain": [ - "(0.007625590107402798,\n", - " {'AUC': 0.8497570761248703, 'Fraud Cost': 0.007625590107402798})" + "(0.007626457064581641,\n", + " {'AUC': 0.8336438887334185, 'Fraud Cost': 0.007626457064581641})" ] }, "execution_count": 10, @@ -381,11 +424,11 @@ "\n", "Optimizing for AUC. Greater score is better.\n", "\n", - "Searching up to 10 pipelines. No time limit is set. Set one using max_time parameter.\n", + "Searching up to 5 pipelines. No time limit is set. 
Set one using max_time parameter.\n", "\n", "Possible model types: xgboost, linear_model, random_forest\n", "\n", - "Testing XGBoost w/ imputation: 100%|██████████| 10/10 [01:29<00:00, 8.96s/it] \n", + "Testing LogisticRegression w/ imputation + scaling: 100%|██████████| 5/5 [19:16<00:00, 231.26s/it]\n", "\n", "✔ Optimization finished\n" ] @@ -393,7 +436,8 @@ ], "source": [ "clf_auc = evalml.AutoClassifier(objective='auc',\n", - " max_pipelines=10)\n", + " additional_objectives=['recall', 'precision'],\n", + " max_pipelines=5)\n", "\n", "clf_auc.fit(X_train, y_train)" ] @@ -440,113 +484,63 @@ " \n", " \n", " \n", - " 0\n", - " 1\n", + " 0\n", + " 2\n", " RFClassificationPipeline\n", - " 0.863238\n", + " 0.873053\n", " False\n", " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", " \n", " \n", - " 1\n", - " 9\n", + " 1\n", + " 0\n", " XGBoostPipeline\n", - " 0.852239\n", + " 0.849826\n", " False\n", - " {'eta': 0.38438170729269994, 'min_child_weight...\n", - " \n", - " \n", - " 2\n", - " 3\n", - " RFClassificationPipeline\n", - " 0.847514\n", - " False\n", - " {'n_estimators': 369, 'max_depth': 10, 'impute...\n", - " \n", - " \n", - " 3\n", - " 8\n", - " RFClassificationPipeline\n", - " 0.846346\n", - " False\n", - " {'n_estimators': 715, 'max_depth': 7, 'impute_...\n", + " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", " \n", - " 4\n", " 2\n", + " 1\n", " XGBoostPipeline\n", - " 0.845902\n", + " 0.840634\n", " False\n", - " {'eta': 0.5928446182250184, 'min_child_weight'...\n", + " {'eta': 0.38438170729269994, 'min_child_weight...\n", " \n", " \n", - " 5\n", - " 5\n", - " RFClassificationPipeline\n", - " 0.842745\n", + " 3\n", + " 3\n", + " XGBoostPipeline\n", + " 0.839091\n", " False\n", - " {'n_estimators': 609, 'max_depth': 7, 'impute_...\n", + " {'eta': 0.5288949197529046, 'min_child_weight'...\n", " \n", " \n", - " 6\n", + " 4\n", " 4\n", " LogisticRegressionPipeline\n", - " 0.838806\n", - " False\n", - " {'penalty': 'l2', 'C': 6.239401330891865, 'imp...\n", - " \n", - " \n", - " 7\n", - " 0\n", - " LogisticRegressionPipeline\n", - " 0.838806\n", + " 0.831181\n", " False\n", " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", " \n", - " \n", - " 8\n", - " 7\n", - " LogisticRegressionPipeline\n", - " 0.838806\n", - " False\n", - " {'penalty': 'l2', 'C': 8.123565600467177, 'imp...\n", - " \n", - " \n", - " 9\n", - " 6\n", - " LogisticRegressionPipeline\n", - " 0.838806\n", - " False\n", - " {'penalty': 'l2', 'C': 0.5765626434012575, 'im...\n", - " \n", " \n", "\n", "" ], "text/plain": [ " id pipeline_name score high_variance_cv \\\n", - "0 1 RFClassificationPipeline 0.863238 False \n", - "1 9 XGBoostPipeline 0.852239 False \n", - "2 3 RFClassificationPipeline 0.847514 False \n", - "3 8 RFClassificationPipeline 0.846346 False \n", - "4 2 XGBoostPipeline 0.845902 False \n", - "5 5 RFClassificationPipeline 0.842745 False \n", - "6 4 LogisticRegressionPipeline 0.838806 False \n", - "7 0 LogisticRegressionPipeline 0.838806 False \n", - "8 7 LogisticRegressionPipeline 0.838806 False \n", - "9 6 LogisticRegressionPipeline 0.838806 False \n", + "0 2 RFClassificationPipeline 0.873053 False \n", + "1 0 XGBoostPipeline 0.849826 False \n", + "2 1 XGBoostPipeline 0.840634 False \n", + "3 3 XGBoostPipeline 0.839091 False \n", + "4 4 LogisticRegressionPipeline 0.831181 False \n", "\n", " parameters \n", "0 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", - "1 {'eta': 0.38438170729269994, 'min_child_weight... \n", - "2 {'n_estimators': 369, 'max_depth': 10, 'impute... 
\n", - "3 {'n_estimators': 715, 'max_depth': 7, 'impute_... \n", - "4 {'eta': 0.5928446182250184, 'min_child_weight'... \n", - "5 {'n_estimators': 609, 'max_depth': 7, 'impute_... \n", - "6 {'penalty': 'l2', 'C': 6.239401330891865, 'imp... \n", - "7 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", - "8 {'penalty': 'l2', 'C': 8.123565600467177, 'imp... \n", - "9 {'penalty': 'l2', 'C': 0.5765626434012575, 'im... " + "1 {'eta': 0.5928446182250184, 'min_child_weight'... \n", + "2 {'eta': 0.38438170729269994, 'min_child_weight... \n", + "3 {'eta': 0.5288949197529046, 'min_child_weight'... \n", + "4 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... " ] }, "execution_count": 12, @@ -566,7 +560,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 13, @@ -589,8 +583,8 @@ { "data": { "text/plain": [ - "(0.8619958322554153,\n", - " {'AUC': 0.8619958322554153, 'Fraud Cost': 0.03432590219090485})" + "(0.8745605699827037,\n", + " {'AUC': 0.8745605699827037, 'Fraud Cost': 0.03273490785793763})" ] }, "execution_count": 14, @@ -629,7 +623,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/docs/source/roadmap.rst b/docs/source/roadmap.rst index 69bd353677..fe25b6d3ef 100644 --- a/docs/source/roadmap.rst +++ b/docs/source/roadmap.rst @@ -12,4 +12,7 @@ There are numerous new features and functionality planned for EvalML, some of wh * Instructions for adding your own modeling pipelines for EvalML to tune * Add additional hyperparameter tuning methods * Handle categorical data natively within EvalML + + * One-hot encoding has been incorparted natively within EvalMl + * Visualizations for understanding model search From fd321555b5394bc3d45a4550cea2ea06361db88b Mon Sep 17 00:00:00 2001 From: Jeremy Shih Date: Wed, 18 Sep 2019 10:51:40 -0400 Subject: [PATCH 18/18] Remove from road map --- docs/source/roadmap.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/source/roadmap.rst b/docs/source/roadmap.rst index fe25b6d3ef..29c6e48889 100644 --- a/docs/source/roadmap.rst +++ b/docs/source/roadmap.rst @@ -11,8 +11,4 @@ There are numerous new features and functionality planned for EvalML, some of wh * Ability to warm start from a previous pipeline search * Instructions for adding your own modeling pipelines for EvalML to tune * Add additional hyperparameter tuning methods -* Handle categorical data natively within EvalML - - * One-hot encoding has been incorparted natively within EvalMl - * Visualizations for understanding model search