WIP: Pipeline v2 (#108)

* adding skeleton for component_base * oops, added nested components folders, fixing * linting * adding skeleton estimator/transformer classes * adding basic estimator components for merge merge * WIP: Components (#107) * Added imputer * Clean up imputer * added onehot and standard scaler * Need fix selectfrommodel and validating * Add selectfrommodel * Cleanup and added basic init test * lint * cleaning up, more merging, combining tests * adding componenttype enum, a little more merging * linting * Moved to estimator * lint fix :P * pipeline v2 base * pushing new base for pipeline and simple test * Faulty scaler for LR * Working LR * Broken RF * Fixed SelectFromModel * Added RF pipeline Fixed SelectFromModel * changing xgboost to use our components * Added RF Regression * beginning to fix broken pieces * continuing to fix tests * fixing tests in autoclassifier * linting * fixing silly typo bug * linting * cleaning up pipeline and pipelinebase classes * adding describe to components and pipeline * linting and fixing minor bug * adding check for estimator in pipeline * Pipeline indexing (#118) * Added indexing and basic tests * Switched to pipelinebase for slicing * Clean up and add docstrings * lint * lint again * Clean up and add error for setting * adding default value to next() to prevent StopIteration error * oops, actually fixing... 
* fixing docstrings and cleaning imports * adding simple tests for describe * linting * Autogenerated pipeline names (#122) * Basic name without check * Add assert * lint * Changed name format and name constants * Cleanup * moving files to subfolders and removing hyperparameters as class var * updating file hierarchy * linting * removing duplicate * Add feature_importance tests * adding extra tests * adding test, cleanup * Added linear regression and test for pipeline fitting (#131) * Added linear regression pipeline and added test for fitting * Remove unnecessary fit for xgboost component (#148) * Remove fit for xgboost * Remove kwargs * addressing pr comments: rename, abstract feature_importances, del __init__, cleanup, etc. (#149) * addressing pr comments: rename, del __init__, cleanup * cleaning up describe * feature importances + cleanup of components, added subclasses encoder + feature_selector * linting, fixing errors * adding less specific version * addressing comments * import errors * String and component_type component (#153) * added handling str and component type for component list * Adding model_type / problem_type to PipelineBase to allow inference (#157) * adding problem_type and model_type to pipeline base * problem_type --> problem_types * cleaning up self.component_list and init pipeline, typo * forgot to remove print * removing comments * changelog * removing generic SelectFromModel * Jeremy changes (#164) * cleanup components utils * Switch to category_encoder and cleanup RF Select * add feature_importance to estimator * More switching to CE * Move parameters to class attributes and make test * lint * Separate changelog test * Text cleanup * wip: addressing comments (#159) * wip: addressing comments * feature names? 
* adding fix for no estimator in generate_name * fixing test * addressing more comments on feature_importance * feature_importances fixed and cleaned :) * oops, missed merge conflict * change of name * minor cleanup * adding basic test for two feature selectors and adding default components in DEFAULT_COMPONENTS * adding tests for retaining feature names in input_feature_names * addressing comments * cleanup * fixing
alteryx · Nov 4, 2019 · 4a58a11 · 4a58a11
1 parent e412fdb
commit 4a58a11
Show file tree

Hide file tree

Showing 47 changed files with 1,471 additions and 213 deletions.
diff --git a/.gitignore b/.gitignore
@@ -110,3 +110,4 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+.DS_Store
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -7,6 +7,7 @@ Changelog
         * Added support for unlimited pipelines with a max_time limit :pr:`70`
     * Fixes
     * Changes
+        * Refactoring pipelines :pr:`108`
     * Documentation Changes
     * Testing Changes
 

diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py
@@ -164,7 +164,6 @@ def _do_iteration(self, X, y, pbar, raise_errors):
 
         # propose the next best parameters for this piepline
         parameters = self._propose_parameters(pipeline_class)
-
         # fit an score the pipeline
         pipeline = pipeline_class(
             objective=self.objective,
@@ -199,7 +198,6 @@ def _do_iteration(self, X, y, pbar, raise_errors):
             try:
                 pipeline.fit(X_train, y_train)
                 score, other_scores = pipeline.score(X_test, y_test, other_objectives=self.additional_objectives)
-
             except Exception as e:
                 if raise_errors:
                     raise e

diff --git a/evalml/pipelines/__init__.py b/evalml/pipelines/__init__.py
@@ -1,11 +1,29 @@
 # flake8:noqa
+from .components import (
+    Estimator,
+    OneHotEncoder,
+    SimpleImputer,
+    StandardScaler,
+    Transformer,
+    LinearRegressor,
+    LogisticRegressionClassifier,
+    RandomForestClassifier,
+    RandomForestRegressor,
+    XGBoostClassifier,
+    ComponentTypes,
+    FeatureSelector,
+    CategoricalEncoder,
+    RFClassifierSelectFromModel,
+    RFRegressorSelectFromModel
+)
+
 from .pipeline_base import PipelineBase
 from .classification import (
     LogisticRegressionPipeline,
     RFClassificationPipeline,
     XGBoostPipeline
 )
-from .regression import RFRegressionPipeline
+from .regression import LinearRegressionPipeline, RFRegressionPipeline
 from .utils import (
     get_pipelines,
     list_model_types,

diff --git a/evalml/pipelines/classification/logistic_regression.py b/evalml/pipelines/classification/logistic_regression.py
@@ -1,20 +1,19 @@
-import category_encoders as ce
-import numpy as np
-import pandas as pd
-from sklearn.impute import SimpleImputer
-from sklearn.linear_model import LogisticRegression
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler
 from skopt.space import Real
 
 from evalml.model_types import ModelTypes
 from evalml.pipelines import PipelineBase
+from evalml.pipelines.components import (
+    LogisticRegressionClassifier,
+    OneHotEncoder,
+    SimpleImputer,
+    StandardScaler
+)
 from evalml.problem_types import ProblemTypes
 
 
 class LogisticRegressionPipeline(PipelineBase):
     """Logistic Regression Pipeline for both binary and multiclass classification"""
-    name = "LogisticRegression w/ imputation + scaling"
+    name = "Logistic Regression Classifier w/ One Hot Encoder + Simple Imputer + Standard Scaler"
     model_type = ModelTypes.LINEAR_MODEL
     problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
 
@@ -25,39 +24,15 @@ class LogisticRegressionPipeline(PipelineBase):
     }
 
     def __init__(self, objective, penalty, C, impute_strategy,
-                 number_features, n_jobs=1, random_state=0):
-        imputer = SimpleImputer(strategy=impute_strategy)
-        enc = ce.OneHotEncoder(use_cat_names=True, return_df=True)
-
-        estimator = LogisticRegression(random_state=random_state,
-                                       penalty=penalty,
-                                       C=C,
-                                       multi_class='auto',
-                                       solver="lbfgs",
-                                       n_jobs=-1)
-
-        self.pipeline = Pipeline(
-            [("encoder", enc),
-             ("imputer", imputer),
-             ("scaler", StandardScaler()),
-             ("estimator", estimator)]
-        )
-
-        super().__init__(objective=objective, random_state=random_state)
-
-    @property
-    def feature_importances(self):
-        """Return feature importances. Feature dropped by feaure selection are excluded"""
-        coef_ = self.pipeline["estimator"].coef_
-
-        # binary classification case
-        if len(coef_) <= 2:
-            importances = list(zip(self.input_feature_names, coef_[0]))
-            importances.sort(key=lambda x: -abs(x[1]))
-        else:
-            # mutliclass classification case
-            importances = list(zip(self.input_feature_names, np.linalg.norm(coef_, axis=0, ord=2)))
-            importances.sort(key=lambda x: -(x[1]))
-
-        df = pd.DataFrame(importances, columns=["feature", "importance"])
-        return df
+                 number_features, n_jobs=-1, random_state=0):
+
+        imputer = SimpleImputer(impute_strategy=impute_strategy)
+        enc = OneHotEncoder()
+        scaler = StandardScaler()
+        estimator = LogisticRegressionClassifier(random_state=random_state,
+                                                 penalty=penalty,
+                                                 C=C,
+                                                 n_jobs=-1)
+
+        super().__init__(objective=objective,
+                         component_list=[enc, imputer, scaler, estimator])
diff --git a/evalml/pipelines/classification/random_forest.py b/evalml/pipelines/classification/random_forest.py
@@ -1,20 +1,20 @@
-import category_encoders as ce
 import numpy as np
-import pandas as pd
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.feature_selection import SelectFromModel
-from sklearn.impute import SimpleImputer
-from sklearn.pipeline import Pipeline
 from skopt.space import Integer, Real
 
 from evalml.model_types import ModelTypes
 from evalml.pipelines import PipelineBase
+from evalml.pipelines.components import (
+    OneHotEncoder,
+    RandomForestClassifier,
+    RFClassifierSelectFromModel,
+    SimpleImputer
+)
 from evalml.problem_types import ProblemTypes
 
 
 class RFClassificationPipeline(PipelineBase):
     """Random Forest Pipeline for both binary and multiclass classification"""
-    name = "Random Forest w/ imputation"
+    name = "Random Forest Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model"
     model_type = ModelTypes.RANDOM_FOREST
     problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
 
@@ -27,36 +27,19 @@ class RFClassificationPipeline(PipelineBase):
 
     def __init__(self, objective, n_estimators, max_depth, impute_strategy,
                  percent_features, number_features, n_jobs=1, random_state=0):
-        imputer = SimpleImputer(strategy=impute_strategy)
-        enc = ce.OneHotEncoder(use_cat_names=True, return_df=True)
 
-        estimator = RandomForestClassifier(random_state=random_state,
-                                           n_estimators=n_estimators,
+        imputer = SimpleImputer(impute_strategy=impute_strategy)
+        enc = OneHotEncoder()
+        estimator = RandomForestClassifier(n_estimators=n_estimators,
                                            max_depth=max_depth,
-                                           n_jobs=n_jobs)
-
-        feature_selection = SelectFromModel(
-            estimator=estimator,
-            max_features=max(1, int(percent_features * number_features)),
-            threshold=-np.inf
-        )
-
-        self.pipeline = Pipeline(
-            [("encoder", enc),
-             ("imputer", imputer),
-             ("feature_selection", feature_selection),
-             ("estimator", estimator)]
-        )
-
-        super().__init__(objective=objective, random_state=random_state)
-
-    @property
-    def feature_importances(self):
-        """Return feature importances. Feature dropped by feaure selection are excluded"""
-        indices = self.pipeline["feature_selection"].get_support(indices=True)
-        feature_names = list(map(lambda i: self.input_feature_names[i], indices))
-        importances = list(zip(feature_names, self.pipeline["estimator"].feature_importances_))
-        importances.sort(key=lambda x: -abs(x[1]))
-
-        df = pd.DataFrame(importances, columns=["feature", "importance"])
-        return df
+                                           n_jobs=n_jobs,
+                                           random_state=random_state)
+        feature_selection = RFClassifierSelectFromModel(n_estimators=n_estimators,
+                                                        max_depth=max_depth,
+                                                        number_features=number_features,
+                                                        percent_features=percent_features,
+                                                        threshold=-np.inf,
+                                                        random_state=random_state)
+
+        super().__init__(objective=objective,
+                         component_list=[enc, imputer, feature_selection, estimator])
diff --git a/evalml/pipelines/classification/xgboost.py b/evalml/pipelines/classification/xgboost.py
@@ -1,20 +1,20 @@
-import category_encoders as ce
 import numpy as np
-import pandas as pd
-from sklearn.feature_selection import SelectFromModel
-from sklearn.impute import SimpleImputer
-from sklearn.pipeline import Pipeline
 from skopt.space import Integer, Real
-from xgboost import XGBClassifier
 
 from evalml.model_types import ModelTypes
 from evalml.pipelines import PipelineBase
+from evalml.pipelines.components import (
+    OneHotEncoder,
+    RFClassifierSelectFromModel,
+    SimpleImputer,
+    XGBoostClassifier
+)
 from evalml.problem_types import ProblemTypes
 
 
 class XGBoostPipeline(PipelineBase):
     """XGBoost Pipeline for both binary and multiclass classification"""
-    name = "XGBoost w/ imputation"
+    name = "XGBoost Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model"
     model_type = ModelTypes.XGBOOST
     problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
 
@@ -27,68 +27,20 @@ class XGBoostPipeline(PipelineBase):
     }
 
     def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy,
-                 percent_features, number_features, n_jobs=1, random_state=0):
-        imputer = SimpleImputer(strategy=impute_strategy)
-        enc = ce.OneHotEncoder(use_cat_names=True, return_df=True)
-
-        estimator = XGBClassifier(
-            random_state=random_state,
-            eta=eta,
-            max_depth=max_depth,
-            min_child_weight=min_child_weight
-        )
-
-        feature_selection = SelectFromModel(
-            estimator=estimator,
-            max_features=max(1, int(percent_features * number_features)),
-            threshold=-np.inf
-        )
-
-        self.pipeline = Pipeline(
-            [("encoder", enc),
-             ("imputer", imputer),
-             ("feature_selection", feature_selection),
-             ("estimator", estimator)]
-        )
-
-        super().__init__(objective=objective, random_state=random_state)
-
-    # Need to override fit for multiclass
-    def fit(self, X, y, objective_fit_size=.2):
-        """Build a model
-
-        Arguments:
-            X (pd.DataFrame or np.array): the input training data of shape [n_samples, n_features]
-
-            y (pd.Series): the target training labels of length [n_samples]
-
-        Returns:
-
-            self
-
-        """
-        # check if problem is multiclass
-        num_classes = len(np.unique(y))
-        if num_classes > 2:
-            params = self.pipeline['estimator'].get_params()
-            params.update(
-                {
-                    "objective": 'multi:softprob',
-                    "num_class": num_classes
-                })
-
-            estimator = XGBClassifier(**params)
-            self.pipeline.steps[-1] = ('estimator', estimator)
-
-        return super().fit(X, y, objective_fit_size)
-
-    @property
-    def feature_importances(self):
-        """Return feature importances. Feature dropped by feaure selection are excluded"""
-        indices = self.pipeline["feature_selection"].get_support(indices=True)
-        feature_names = list(map(lambda i: self.input_feature_names[i], indices))
-        importances = list(zip(feature_names, self.pipeline["estimator"].feature_importances_))
-        importances.sort(key=lambda x: -abs(x[1]))
-
-        df = pd.DataFrame(importances, columns=["feature", "importance"])
-        return df
+                 percent_features, number_features, n_estimators=10, n_jobs=1, random_state=0):
+
+        imputer = SimpleImputer(impute_strategy=impute_strategy)
+        enc = OneHotEncoder()
+        feature_selection = RFClassifierSelectFromModel(n_estimators=n_estimators,
+                                                        max_depth=max_depth,
+                                                        number_features=number_features,
+                                                        percent_features=percent_features,
+                                                        threshold=-np.inf,
+                                                        random_state=random_state)
+        estimator = XGBoostClassifier(random_state=random_state,
+                                      eta=eta,
+                                      max_depth=max_depth,
+                                      min_child_weight=min_child_weight)
+
+        super().__init__(objective=objective,
+                         component_list=[enc, imputer, feature_selection, estimator])
diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py
@@ -0,0 +1,23 @@
+# flake8:noqa
+from .component_base import ComponentBase
+from .component_types import ComponentTypes
+from .estimators import (
+    Estimator,
+    LinearRegressor,
+    LogisticRegressionClassifier,
+    RandomForestClassifier,
+    RandomForestRegressor,
+    XGBoostClassifier
+)
+from .transformers import (
+    Transformer,
+    OneHotEncoder,
+    RFClassifierSelectFromModel,
+    RFRegressorSelectFromModel,
+    SimpleImputer,
+    StandardScaler,
+    FeatureSelector,
+    CategoricalEncoder
+    )
+
+from .utils import handle_component, str_to_component_type