
Add ExtraTrees Pipeline #790

Merged
merged 38 commits into from May 29, 2020
Commits
b33bcb1
Add ExtraTrees as a ModelFamily
eccabay May 20, 2020
03b51d4
Add components for ExtraTrees classifier and regressor
eccabay May 20, 2020
b0c787d
Add Pipeline classes for ExtraTrees classification and regression
eccabay May 20, 2020
8e69c9d
Add ExtraTrees to init files
eccabay May 20, 2020
5050ffd
Modify current tests to include ExtraTrees pipeline and components
eccabay May 20, 2020
e4cc862
Merge branch 'master' into 771_extratrees_pipeline
eccabay May 20, 2020
1b077bf
Update ModelFamily and components tests to include ExtraTrees
eccabay May 20, 2020
77a71e5
Fix CatBoost learning rate bug - reorder _ALL_PIPELINES
eccabay May 21, 2020
30f6cfe
Add classification pipeline tests
eccabay May 21, 2020
6612f0b
Add ExtraTrees to the API reference
eccabay May 21, 2020
26fc87d
Fix pipeline tests to use correct estimators
eccabay May 21, 2020
6543fd4
Update changelog
eccabay May 21, 2020
af5f67c
Merge branch 'master' into 771_extratrees_pipeline
eccabay May 21, 2020
e44989e
Lint fixes
eccabay May 21, 2020
5dd78cc
More lint fixes
eccabay May 21, 2020
4207a8f
Merge branch 'master' into 771_extratrees_pipeline
eccabay May 21, 2020
23258d3
Merge branch 'master' into 771_extratrees_pipeline
eccabay May 22, 2020
07b15be
Remove explicit components/et_classifier feature_importances
eccabay May 22, 2020
e66bf80
Rename classification pipeline test_et to test_et_classification
eccabay May 26, 2020
131f541
Update ExtraTrees parameters to better reflect model
eccabay May 26, 2020
695ca3c
Remove ExtraTrees from automl (through _ALL_PIPELINES)
eccabay May 26, 2020
c7ef269
Preliminary ExtraTrees components tests
eccabay May 26, 2020
c26b2f4
Linting and a few test tweaks
eccabay May 27, 2020
af800f4
lint fixes
eccabay May 28, 2020
bb73610
Update et_regression pipeline tests to remove redundancy
eccabay May 28, 2020
ad65a49
Merge branch 'master' into 771_extratrees_pipeline
eccabay May 28, 2020
570d206
Update et_classification pipeline tests to remove redundancy
eccabay May 28, 2020
3b63c1e
Merge branch '771_extratrees_pipeline' of https://github.com/FeatureL…
eccabay May 28, 2020
c69cfac
Fix hidden failing components tests
eccabay May 28, 2020
bbd28a2
Fix hidden failing components tests
eccabay May 28, 2020
84f99ca
Merge branch '771_extratrees_pipeline' of https://github.com/FeatureL…
eccabay May 28, 2020
42f93d6
Expose max_depth as a hyperparameter
eccabay May 29, 2020
0280854
Remove feature selection from extratrees
eccabay May 29, 2020
34be3a2
Merge branch 'master' into 771_extratrees_pipeline
eccabay May 29, 2020
817395a
Split up et_classifier pipeline tests
eccabay May 29, 2020
8c03017
Merge branch '771_extratrees_pipeline' of https://github.com/FeatureL…
eccabay May 29, 2020
ea6d4f1
remove unnecessary test
eccabay May 29, 2020
94759e8
lint fix
eccabay May 29, 2020
5 changes: 5 additions & 0 deletions docs/source/api_reference.rst
@@ -76,6 +76,8 @@ Classification Pipelines

CatBoostBinaryClassificationPipeline
CatBoostMulticlassClassificationPipeline
ETBinaryClassificationPipeline
ETMulticlassClassificationPipeline
LogisticRegressionBinaryPipeline
LogisticRegressionMulticlassPipeline
RFBinaryClassificationPipeline
@@ -98,6 +100,7 @@ Regression Pipelines

RFRegressionPipeline
CatBoostRegressionPipeline
ETRegressionPipeline
LinearRegressionPipeline
XGBoostRegressionPipeline
BaselineRegressionPipeline
@@ -178,6 +181,7 @@ Classifiers are components that output a predicted class label.
:nosignatures:

CatBoostClassifier
ExtraTreesClassifier
RandomForestClassifier
LogisticRegressionClassifier
XGBoostClassifier
@@ -195,6 +199,7 @@ Regressors are components that output a predicted target value.

CatBoostRegressor
LinearRegressor
ExtraTreesRegressor
RandomForestRegressor
XGBoostRegressor
BaselineRegressor
2 changes: 2 additions & 0 deletions docs/source/changelog.rst
@@ -7,6 +7,7 @@ Changelog
* Added baseline models for classification and regression, add functionality to calculate baseline models before searching in AutoML :pr:`746`
* Port over highly-null guardrail as a data check and define `DefaultDataChecks` and `DisableDataChecks` classes :pr:`745`
* Update `Tuner` classes to work directly with pipeline parameters dicts instead of flat parameter lists :pr:`779`
* Added new Pipeline option `ExtraTrees` :pr:`790`
* Added precision-recall curve metrics and plot for binary classification problems in `evalml.pipeline.graph_utils` :pr:`794`
* Fixes
* Update pipeline `score` to return `nan` score for any objective which throws an exception during scoring :pr:`787`
@@ -36,6 +37,7 @@ Changelog
* Added unit tests for fraud cost, lead scoring, and standard metric objectives :pr:`741`
* Update codecov client :pr:`782`
* Updated AutoBase __str__ test to include no parameters case :pr:`783`
* Added unit tests for `ExtraTrees` pipeline :pr:`790`
* If codecov fails to upload, fail build :pr:`810`
* Updated Python version of dependency action :pr:`816`
* Update the dependency update bot to use a suffix when creating branches :pr:`817`
2 changes: 2 additions & 0 deletions evalml/model_family/model_family.py
@@ -7,6 +7,7 @@ class ModelFamily(Enum):
XGBOOST = 'xgboost'
LINEAR_MODEL = 'linear_model'
CATBOOST = 'catboost'
EXTRA_TREES = 'extra_trees'
Contributor:

Yep. Makes sense to have a separate family here for now. In the new automl algo this will mean the first round runs both RF and extra trees. Down the road we may want to group tree-based models differently, but this is a great starting point. @jeremyliweishih

BASELINE = 'baseline'
NONE = 'none'

@@ -15,6 +16,7 @@ def __str__(self):
ModelFamily.XGBOOST.name: "XGBoost",
ModelFamily.LINEAR_MODEL.name: "Linear",
ModelFamily.CATBOOST.name: "CatBoost",
ModelFamily.EXTRA_TREES.name: "Extra Trees",
ModelFamily.BASELINE.name: "Baseline",
ModelFamily.NONE.name: "None"}
return model_family_dict[self.name]
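The `__str__` pattern used above — an `Enum` whose display names live in a dict keyed by member name — can be sketched in a few self-contained lines (illustrative only; member names other than `EXTRA_TREES` are trimmed here):

```python
from enum import Enum


class ModelFamily(Enum):
    EXTRA_TREES = "extra_trees"
    BASELINE = "baseline"

    def __str__(self):
        # Map enum member names to human-readable display strings,
        # mirroring the model_family_dict approach in the diff above.
        names = {"EXTRA_TREES": "Extra Trees", "BASELINE": "Baseline"}
        return names[self.name]


print(str(ModelFamily.EXTRA_TREES))  # prints "Extra Trees"
```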
7 changes: 6 additions & 1 deletion evalml/pipelines/__init__.py
@@ -16,7 +16,9 @@
RFClassifierSelectFromModel,
RFRegressorSelectFromModel,
CatBoostClassifier,
CatBoostRegressor
CatBoostRegressor,
ExtraTreesClassifier,
ExtraTreesRegressor
)

from .pipeline_base import PipelineBase
@@ -28,6 +30,8 @@
from .classification import (
CatBoostBinaryClassificationPipeline,
CatBoostMulticlassClassificationPipeline,
ETBinaryClassificationPipeline,
ETMulticlassClassificationPipeline,
LogisticRegressionBinaryPipeline,
LogisticRegressionMulticlassPipeline,
RFBinaryClassificationPipeline,
@@ -45,6 +49,7 @@
RFRegressionPipeline,
CatBoostRegressionPipeline,
XGBoostRegressionPipeline,
ETRegressionPipeline,
BaselineRegressionPipeline,
MeanBaselineRegressionPipeline
)
2 changes: 2 additions & 0 deletions evalml/pipelines/classification/__init__.py
@@ -7,5 +7,7 @@
from .catboost_multiclass import CatBoostMulticlassClassificationPipeline
from .random_forest_binary import RFBinaryClassificationPipeline
from .random_forest_multiclass import RFMulticlassClassificationPipeline
from .extra_trees_binary import ETBinaryClassificationPipeline
from .extra_trees_multiclass import ETMulticlassClassificationPipeline
from .baseline_binary import BaselineBinaryPipeline, ModeBaselineBinaryPipeline
from .baseline_multiclass import BaselineMulticlassPipeline, ModeBaselineMulticlassPipeline
7 changes: 7 additions & 0 deletions evalml/pipelines/classification/extra_trees_binary.py
@@ -0,0 +1,7 @@
from evalml.pipelines import BinaryClassificationPipeline


class ETBinaryClassificationPipeline(BinaryClassificationPipeline):
"""Extra Trees Pipeline for binary classification"""
custom_name = "Extra Trees Binary Classification Pipeline"
component_graph = ['One Hot Encoder', 'Simple Imputer', 'Extra Trees Classifier']
7 changes: 7 additions & 0 deletions evalml/pipelines/classification/extra_trees_multiclass.py
@@ -0,0 +1,7 @@
from evalml.pipelines import MulticlassClassificationPipeline


class ETMulticlassClassificationPipeline(MulticlassClassificationPipeline):
"""Extra Trees Pipeline for multiclass classification"""
custom_name = "Extra Trees Multiclass Classification Pipeline"
component_graph = ['One Hot Encoder', 'Simple Imputer', 'Extra Trees Classifier']
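For readers unfamiliar with evalml's `component_graph` convention: the three-step graph above corresponds roughly to a plain scikit-learn pipeline, since the evalml components wrap the sklearn estimators of the same names. A minimal sketch (assuming scikit-learn and NumPy are installed; the one-hot encoding step is omitted because the toy data here is already numeric):

```python
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesClassifier

# Rough sklearn analogue of the evalml component graph
# ['One Hot Encoder', 'Simple Imputer', 'Extra Trees Classifier'].
pipeline = Pipeline([
    ("imputer", SimpleImputer()),
    ("clf", ExtraTreesClassifier(n_estimators=100, max_depth=6, random_state=0)),
])

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = np.array([0, 1] * 10)

pipeline.fit(X, y)
preds = pipeline.predict(X)
```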
2 changes: 2 additions & 0 deletions evalml/pipelines/components/__init__.py
@@ -8,6 +8,8 @@
RandomForestRegressor,
XGBoostClassifier,
CatBoostClassifier,
ExtraTreesClassifier,
ExtraTreesRegressor,
CatBoostRegressor,
XGBoostRegressor,
BaselineClassifier,
2 changes: 2 additions & 0 deletions evalml/pipelines/components/estimators/__init__.py
@@ -4,9 +4,11 @@
RandomForestClassifier,
XGBoostClassifier,
CatBoostClassifier,
ExtraTreesClassifier,
BaselineClassifier)
from .regressors import (LinearRegressor,
RandomForestRegressor,
CatBoostRegressor,
XGBoostRegressor,
ExtraTreesRegressor,
BaselineRegressor)
@@ -3,4 +3,5 @@
from .rf_classifier import RandomForestClassifier
from .xgboost_classifier import XGBoostClassifier
from .catboost_classifier import CatBoostClassifier
from .et_classifier import ExtraTreesClassifier
from .baseline_classifier import BaselineClassifier
@@ -0,0 +1,40 @@
from sklearn.ensemble import ExtraTreesClassifier as SKExtraTreesClassifier
from skopt.space import Integer

from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes


class ExtraTreesClassifier(Estimator):
"""Extra Trees Classifier"""
name = "Extra Trees Classifier"
hyperparameter_ranges = {
"n_estimators": Integer(10, 1000),
"max_features": ["auto", "sqrt", "log2"],
"max_depth": Integer(4, 10)
}
model_family = ModelFamily.EXTRA_TREES
supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

def __init__(self,
n_estimators=100,
max_features="auto",
max_depth=6,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_jobs=-1,
random_state=0):
parameters = {"n_estimators": n_estimators,
"max_features": max_features,
"max_depth": max_depth}
et_classifier = SKExtraTreesClassifier(n_estimators=n_estimators,
max_features=max_features,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_weight_fraction_leaf=min_weight_fraction_leaf,
n_jobs=n_jobs,
random_state=random_state)
super().__init__(parameters=parameters,
component_obj=et_classifier,
random_state=random_state)
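Note that only `n_estimators`, `max_features`, and `max_depth` are recorded in the component's `parameters` dict (and exposed in `hyperparameter_ranges`); `min_samples_split`, `min_weight_fraction_leaf`, and `n_jobs` are forwarded to scikit-learn but not surfaced for tuning. A sketch of what the wrapped estimator receives — using `"sqrt"` rather than the then-default `"auto"`, which later scikit-learn releases removed:

```python
from sklearn.ensemble import ExtraTreesClassifier

# Mirror the constructor call the component makes to sklearn.
est = ExtraTreesClassifier(
    n_estimators=100,
    max_features="sqrt",   # "auto" was removed in scikit-learn 1.3
    max_depth=6,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_jobs=-1,
    random_state=0,
)

# Only this tunable subset would appear in the component's `parameters`.
tunable = {k: est.get_params()[k]
           for k in ("n_estimators", "max_features", "max_depth")}
```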
@@ -3,4 +3,5 @@
from .rf_regressor import RandomForestRegressor
from .catboost_regressor import CatBoostRegressor
from .xgboost_regressor import XGBoostRegressor
from .et_regressor import ExtraTreesRegressor
from .baseline_regressor import BaselineRegressor
40 changes: 40 additions & 0 deletions evalml/pipelines/components/estimators/regressors/et_regressor.py
@@ -0,0 +1,40 @@
from sklearn.ensemble import ExtraTreesRegressor as SKExtraTreesRegressor
from skopt.space import Integer

from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes


class ExtraTreesRegressor(Estimator):
"""Extra Trees Regressor"""
name = "Extra Trees Regressor"
hyperparameter_ranges = {
"n_estimators": Integer(10, 1000),
"max_features": ["auto", "sqrt", "log2"],
"max_depth": Integer(4, 10)
}
model_family = ModelFamily.EXTRA_TREES
supported_problem_types = [ProblemTypes.REGRESSION]

def __init__(self,
n_estimators=100,
max_features="auto",
max_depth=6,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_jobs=-1,
random_state=0):
parameters = {"n_estimators": n_estimators,
"max_features": max_features,
"max_depth": max_depth}
et_regressor = SKExtraTreesRegressor(random_state=random_state,
n_estimators=n_estimators,
max_features=max_features,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_weight_fraction_leaf=min_weight_fraction_leaf,
n_jobs=n_jobs)
super().__init__(parameters=parameters,
component_obj=et_regressor,
random_state=random_state)
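The regressor mirrors the classifier exactly, differing only in the wrapped sklearn class and supported problem type. A quick fit/predict sketch against the underlying estimator with the component's defaults (assuming scikit-learn and NumPy; `max_features` is left at its sklearn default since the `"auto"` value used here was later removed upstream):

```python
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

rng = np.random.RandomState(0)
X = rng.rand(50, 4)
y = X @ np.array([1.0, 2.0, 0.0, -1.0])  # simple linear target

# Same n_estimators / max_depth defaults the component passes through.
reg = ExtraTreesRegressor(n_estimators=100, max_depth=6, random_state=0)
reg.fit(X, y)
preds = reg.predict(X)
```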
2 changes: 2 additions & 0 deletions evalml/pipelines/components/utils.py
@@ -10,6 +10,8 @@
CatBoostClassifier,
CatBoostRegressor,
Estimator,
ExtraTreesClassifier,
ExtraTreesRegressor,
LinearRegressor,
LogisticRegressionClassifier,
RandomForestClassifier,
1 change: 1 addition & 0 deletions evalml/pipelines/regression/__init__.py
@@ -3,4 +3,5 @@
from .random_forest import RFRegressionPipeline
from .catboost import CatBoostRegressionPipeline
from .xgboost_regression import XGBoostRegressionPipeline
from .extra_trees import ETRegressionPipeline
from .baseline_regression import BaselineRegressionPipeline, MeanBaselineRegressionPipeline
7 changes: 7 additions & 0 deletions evalml/pipelines/regression/extra_trees.py
@@ -0,0 +1,7 @@
from evalml.pipelines import RegressionPipeline


class ETRegressionPipeline(RegressionPipeline):
"""Extra Trees Pipeline for regression problems"""
custom_name = "Extra Trees Regression Pipeline"
component_graph = ['One Hot Encoder', 'Simple Imputer', 'Extra Trees Regressor']
6 changes: 6 additions & 0 deletions evalml/tests/component_tests/test_components.py
@@ -6,6 +6,8 @@
from evalml.pipelines.components import (
ComponentBase,
Estimator,
ExtraTreesClassifier,
ExtraTreesRegressor,
LinearRegressor,
LogisticRegressionClassifier,
OneHotEncoder,
@@ -66,10 +68,14 @@ def test_describe_component():

# testing estimators
lr_classifier = LogisticRegressionClassifier()
et_classifier = ExtraTreesClassifier(n_estimators=10, max_features="auto")
et_regressor = ExtraTreesRegressor(n_estimators=10, max_features="auto")
rf_classifier = RandomForestClassifier(n_estimators=10, max_depth=3)
rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=3)
linear_regressor = LinearRegressor()
assert lr_classifier.describe(return_dict=True) == {'name': 'Logistic Regression Classifier', 'parameters': {'C': 1.0, 'penalty': 'l2'}}
assert et_classifier.describe(return_dict=True) == {'name': 'Extra Trees Classifier', 'parameters': {'max_depth': 6, 'max_features': "auto", 'n_estimators': 10}}
assert et_regressor.describe(return_dict=True) == {'name': 'Extra Trees Regressor', 'parameters': {'max_depth': 6, 'max_features': "auto", 'n_estimators': 10}}
assert rf_classifier.describe(return_dict=True) == {'name': 'Random Forest Classifier', 'parameters': {'max_depth': 3, 'n_estimators': 10}}
assert rf_regressor.describe(return_dict=True) == {'name': 'Random Forest Regressor', 'parameters': {'max_depth': 3, 'n_estimators': 10}}
assert linear_regressor.describe(return_dict=True) == {'name': 'Linear Regressor', 'parameters': {'fit_intercept': True, 'normalize': False}}
82 changes: 82 additions & 0 deletions evalml/tests/component_tests/test_et_classifier.py
@@ -0,0 +1,82 @@
import numpy as np
import pytest
from sklearn.ensemble import ExtraTreesClassifier as SKExtraTreesClassifier

from evalml.exceptions import MethodPropertyNotFoundError
from evalml.model_family import ModelFamily
from evalml.pipelines import ExtraTreesClassifier
from evalml.problem_types import ProblemTypes


def test_model_family():
assert ExtraTreesClassifier.model_family == ModelFamily.EXTRA_TREES


def test_problem_types():
assert ProblemTypes.BINARY in ExtraTreesClassifier.supported_problem_types
assert ProblemTypes.MULTICLASS in ExtraTreesClassifier.supported_problem_types
assert len(ExtraTreesClassifier.supported_problem_types) == 2


def test_et_parameters():

clf = ExtraTreesClassifier(n_estimators=20, max_features="auto", max_depth=5, random_state=2)
expected_parameters = {
"n_estimators": 20,
"max_features": "auto",
"max_depth": 5
}

assert clf.parameters == expected_parameters
Contributor:

👍

def test_fit_predict_binary(X_y):
X, y = X_y

sk_clf = SKExtraTreesClassifier(max_depth=6, random_state=0)
sk_clf.fit(X, y)
y_pred_sk = sk_clf.predict(X)
y_pred_proba_sk = sk_clf.predict_proba(X)

clf = ExtraTreesClassifier()
clf.fit(X, y)
y_pred = clf.predict(X)
y_pred_proba = clf.predict_proba(X)

np.testing.assert_almost_equal(y_pred, y_pred_sk, decimal=5)
np.testing.assert_almost_equal(y_pred_proba, y_pred_proba_sk, decimal=5)


def test_fit_predict_multi(X_y_multi):
X, y = X_y_multi

sk_clf = SKExtraTreesClassifier(max_depth=6, random_state=0)
sk_clf.fit(X, y)
y_pred_sk = sk_clf.predict(X)
y_pred_proba_sk = sk_clf.predict_proba(X)

clf = ExtraTreesClassifier()
clf.fit(X, y)
y_pred = clf.predict(X)
y_pred_proba = clf.predict_proba(X)

np.testing.assert_almost_equal(y_pred, y_pred_sk, decimal=5)
np.testing.assert_almost_equal(y_pred_proba, y_pred_proba_sk, decimal=5)


def test_feature_importances(X_y):
X, y = X_y

# testing that feature importances can't be called before fit
clf = ExtraTreesClassifier()
with pytest.raises(MethodPropertyNotFoundError):
feature_importances = clf.feature_importances

sk_clf = SKExtraTreesClassifier(max_depth=6, random_state=0)
sk_clf.fit(X, y)
sk_feature_importances = sk_clf.feature_importances_

clf.fit(X, y)
feature_importances = clf.feature_importances

np.testing.assert_almost_equal(sk_feature_importances, feature_importances, decimal=5)
Contributor:
These tests are great!

One thing which you can add: check that calling estimator.parameters on the instance returns what you'd expect. This is partially covered in the components tests, but it would be helpful to have direct coverage here.

Contributor:

And same for the regressor of course lol