Users can now pass in all valid kwargs to LogisticRegressionClassifier, ENClassifier, and CatBoost estimators.
freddyaboulton committed Sep 11, 2020
1 parent 6d4e98e commit b7b86b4
Showing 8 changed files with 83 additions and 20 deletions.
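The changes below forward extra keyword arguments to the underlying scikit-learn and CatBoost objects instead of hardcoding values such as silent, loss, solver, and multi_class. A minimal usage sketch of the new behavior follows; the specific kwargs (border_count, average, solver="liblinear") are illustrative values assumed to be accepted by the underlying libraries, the imports follow the pattern used in the tests below, and the CatBoost example assumes catboost is installed:

from evalml.pipelines.components import (
    CatBoostClassifier,
    ElasticNetClassifier,
    LogisticRegressionClassifier,
)

# Arbitrary valid CatBoost kwargs are forwarded; allow_writing_files is forced
# back to False (with a UserWarning) so CatBoost does not write training
# artifacts to disk during fit.
cb = CatBoostClassifier(n_estimators=50, border_count=32, allow_writing_files=True)
assert cb.parameters["allow_writing_files"] is False

# The SGD-based ElasticNetClassifier keeps loss='log' so predict_proba works,
# even when a different loss is requested (a UserWarning is raised).
en = ElasticNetClassifier(alpha=0.1, average=True, loss="hinge")
assert en.parameters["loss"] == "log"

# solver and multi_class are no longer hardcoded, so they can be overridden too.
lr = LogisticRegressionClassifier(C=0.5, solver="liblinear")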
@@ -1,4 +1,5 @@
import copy
+import warnings

import numpy as np
import pandas as pd
@@ -31,12 +32,17 @@ class CatBoostClassifier(Estimator):
SEED_MIN = 0
SEED_MAX = SEED_BOUNDS.max_bound

-    def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None, random_state=0, **kwargs):
+    def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None, silent=True,
+                 random_state=0, **kwargs):
random_seed = get_random_seed(random_state, self.SEED_MIN, self.SEED_MAX)
parameters = {"n_estimators": n_estimators,
"eta": eta,
"max_depth": max_depth,
-                      'bootstrap_type': bootstrap_type}
+                      'bootstrap_type': bootstrap_type,
+                      'silent': silent}
+        if kwargs.get('allow_writing_files', False):
+            warnings.warn("Parameter allow_writing_files is being set to False in CatBoostClassifier")
+        kwargs["allow_writing_files"] = False
parameters.update(kwargs)

cb_error_msg = "catboost is not installed. Please install using `pip install catboost.`"
@@ -47,9 +53,7 @@ def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None,
if bootstrap_type is None:
cb_parameters.pop('bootstrap_type')
cb_classifier = catboost.CatBoostClassifier(**cb_parameters,
-                                                     random_seed=random_seed,
-                                                     silent=True,
-                                                     allow_writing_files=False)
+                                                     random_seed=random_seed)
super().__init__(parameters=parameters,
component_obj=cb_classifier,
random_state=random_state)
@@ -1,3 +1,5 @@
+import warnings

import numpy as np
from sklearn.linear_model import SGDClassifier as SKElasticNetClassifier
from skopt.space import Real
@@ -17,16 +19,19 @@ class ElasticNetClassifier(Estimator):
model_family = ModelFamily.LINEAR_MODEL
supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

-    def __init__(self, alpha=0.5, l1_ratio=0.5, n_jobs=-1, max_iter=1000, random_state=0, **kwargs):
+    def __init__(self, alpha=0.5, l1_ratio=0.5, n_jobs=-1, max_iter=1000, random_state=0, penalty='elasticnet',
+                 **kwargs):
parameters = {'alpha': alpha,
'l1_ratio': l1_ratio,
'n_jobs': n_jobs,
-                      'max_iter': max_iter}
+                      'max_iter': max_iter,
+                      'penalty': penalty}
+        if kwargs.get('loss', 'log') != 'log':
+            warnings.warn("Parameter loss is being set to 'log' so that ElasticNetClassifier can predict probabilities"
+                          f". Originally received '{kwargs['loss']}'.")
+        kwargs["loss"] = "log"
parameters.update(kwargs)

-        en_classifier = SKElasticNetClassifier(loss="log",
-                                               penalty="elasticnet",
-                                               random_state=random_state,
+        en_classifier = SKElasticNetClassifier(random_state=random_state,
**parameters)
super().__init__(parameters=parameters,
component_obj=en_classifier,
@@ -26,8 +26,6 @@ def __init__(self, penalty="l2", C=1.0, n_jobs=-1, random_state=0, **kwargs):
parameters.update(kwargs)

lr_classifier = LogisticRegression(random_state=random_state,
-                                           multi_class="auto",
-                                           solver="lbfgs",
**parameters)
super().__init__(parameters=parameters,
component_obj=lr_classifier,
@@ -1,4 +1,5 @@
import copy
+import warnings

import pandas as pd
from skopt.space import Integer, Real
@@ -29,12 +30,17 @@ class CatBoostRegressor(Estimator):
SEED_MIN = 0
SEED_MAX = SEED_BOUNDS.max_bound

-    def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None, random_state=0, **kwargs):
+    def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None, silent=False,
+                 random_state=0, **kwargs):
random_seed = get_random_seed(random_state, self.SEED_MIN, self.SEED_MAX)
parameters = {"n_estimators": n_estimators,
"eta": eta,
"max_depth": max_depth,
-                      'bootstrap_type': bootstrap_type}
+                      'bootstrap_type': bootstrap_type,
+                      'silent': silent}
+        if kwargs.get('allow_writing_files', False):
+            warnings.warn("Parameter allow_writing_files is being set to False in CatBoostRegressor")
+        kwargs["allow_writing_files"] = False
parameters.update(kwargs)

cb_error_msg = "catboost is not installed. Please install using `pip install catboost.`"
@@ -44,9 +50,7 @@ def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None,
if bootstrap_type is None:
cb_parameters.pop('bootstrap_type')
cb_regressor = catboost.CatBoostRegressor(**cb_parameters,
-                                                   random_seed=random_seed,
-                                                   silent=True,
-                                                   allow_writing_files=False)
+                                                   random_seed=random_seed)
super().__init__(parameters=parameters,
component_obj=cb_regressor,
random_state=random_state)
12 changes: 12 additions & 0 deletions evalml/tests/component_tests/test_catboost_classifier.py
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
+import pytest
from pytest import importorskip

from evalml.pipelines.components import CatBoostClassifier
@@ -41,3 +42,14 @@ def randint(self, min_bound, max_bound):
rng = make_mock_random_state(CatBoostClassifier.SEED_MAX)
clf = CatBoostClassifier(n_estimators=1, max_depth=1, random_state=rng)
clf.fit(X, y)


def test_overwrite_allow_writing_files_parameter_in_kwargs():

with pytest.warns(expected_warning=UserWarning) as warnings:
cb = CatBoostClassifier(allow_writing_files=True)

assert len(warnings) == 1
# check that the message matches
assert warnings[0].message.args[0] == "Parameter allow_writing_files is being set to False in CatBoostClassifier"
assert not cb.parameters['allow_writing_files']
12 changes: 12 additions & 0 deletions evalml/tests/component_tests/test_catboost_regressor.py
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
+import pytest
from pytest import importorskip

from evalml.pipelines.components import CatBoostRegressor
@@ -41,3 +42,14 @@ def randint(self, min_bound, max_bound):
rng = make_mock_random_state(CatBoostRegressor.SEED_MAX)
clf = CatBoostRegressor(n_estimators=1, max_depth=1, random_state=rng)
clf.fit(X, y)


def test_overwrite_allow_writing_files_parameter_in_kwargs():

with pytest.warns(expected_warning=UserWarning) as warnings:
cb = CatBoostRegressor(allow_writing_files=True)

assert len(warnings) == 1
# check that the message matches
assert warnings[0].message.args[0] == "Parameter allow_writing_files is being set to False in CatBoostRegressor"
assert not cb.parameters['allow_writing_files']
14 changes: 13 additions & 1 deletion evalml/tests/component_tests/test_components.py
@@ -139,7 +139,7 @@ def test_describe_component():
rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=3)
linear_regressor = LinearRegressor()
assert lr_classifier.describe(return_dict=True) == {'name': 'Logistic Regression Classifier', 'parameters': {'penalty': 'l2', 'C': 1.0, 'n_jobs': -1}}
-    assert en_classifier.describe(return_dict=True) == {'name': 'Elastic Net Classifier', 'parameters': {'alpha': 0.5, 'l1_ratio': 0.5, 'n_jobs': -1, 'max_iter': 1000}}
+    assert en_classifier.describe(return_dict=True) == {'name': 'Elastic Net Classifier', 'parameters': {'alpha': 0.5, 'l1_ratio': 0.5, 'n_jobs': -1, 'max_iter': 1000, "loss": 'log', 'penalty': 'elasticnet'}}
assert en_regressor.describe(return_dict=True) == {'name': 'Elastic Net Regressor', 'parameters': {'alpha': 0.5, 'l1_ratio': 0.5, 'max_iter': 1000, 'normalize': False}}
assert et_classifier.describe(return_dict=True) == {'name': 'Extra Trees Classifier', 'parameters': {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}}
assert et_regressor.describe(return_dict=True) == {'name': 'Extra Trees Regressor', 'parameters': {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}}
@@ -769,3 +769,15 @@ def test_serialization_protocol(mock_cloudpickle_dump, tmpdir):
component.save(path, pickle_protocol=42)
assert len(mock_cloudpickle_dump.call_args_list) == 1
assert mock_cloudpickle_dump.call_args_list[0][1]['protocol'] == 42


@pytest.mark.parametrize("estimator_class", _all_estimators())
def test_estimators_accept_all_kwargs(estimator_class):
estimator = estimator_class()
if estimator._component_obj is None:
pytest.skip(f"Skipping {estimator_class} because does not have component object.")
params = estimator._component_obj.get_params()
if estimator_class.model_family == ModelFamily.CATBOOST:
# Deleting because we call it random_state in our api
del params["random_seed"]
estimator_class(**params)
18 changes: 17 additions & 1 deletion evalml/tests/component_tests/test_en_classifier.py
@@ -1,4 +1,5 @@
import numpy as np
+import pytest
from sklearn.linear_model import SGDClassifier as SKElasticNetClassifier

from evalml.model_family import ModelFamily
@@ -18,7 +19,9 @@ def test_en_parameters():
"alpha": 0.75,
"l1_ratio": 0.5,
'max_iter': 1000,
-        'n_jobs': -1
+        'n_jobs': -1,
+        'penalty': 'elasticnet',
+        'loss': 'log'
}
assert clf.parameters == expected_parameters

@@ -107,3 +110,16 @@ def test_feature_importance_multi(X_y_multi):
sk_features = np.linalg.norm(sk_clf.coef_, axis=0, ord=2)

np.testing.assert_almost_equal(sk_features, clf.feature_importance, decimal=5)


def test_overwrite_loss_parameter_in_kwargs():

with pytest.warns(expected_warning=UserWarning) as warnings:
en = ElasticNetClassifier(loss="hinge")

assert len(warnings) == 1
# check that the message matches
assert warnings[0].message.args[0] == ("Parameter loss is being set to 'log' so that ElasticNetClassifier can predict probabilities"
". Originally received 'hinge'.")

assert en.parameters['loss'] == 'log'
