Skip to content

Commit

Permalink
WIP: Pipeline v2 (#108)
Browse files Browse the repository at this point in the history
* adding skeleton for component_base

* oops, added nested components folders, fixing

* linting

* adding skeleton estimator/transformer classes

* adding basic estimator components for merge merge

* WIP: Components (#107)

* Added imputer

* Clean up imputer

* added onehot and standard scaler

* Need fix selectfrommodel and validating

* Add selectfrommodel

* Cleanup and added basic init test

* lint

* cleaning up, more merging, combining tests

* adding componenttype enum, a little more merging

* linting

* Moved to estimator

* lint fix :P

* pipeline v2 base

* pushing new base for pipeline and simple test

* Faulty scaler for LR

* Working LR

* Broken RF

* Fixed SelectFromModel

* Added RF pipeline

Fixed SelectFromModel

* changing xgboost to use our components

* Added RF Regression

* beginning to fix broken pieces

* continuing to fix tests

* fixing tests in autoclassifier

* linting

* fixing silly typo bug

* linting

* cleaning up pipeline and pipelinebase classes

* adding describe to components and pipeline

* linting and fixing minor bug

* adding check for estimator in pipeline

* Pipeline indexing (#118)

* Added indexing and basic tests

* Switched to pipelinebase for slicing

* Clean up and add docstrings

* lint

* lint again

* Clean up and add error for setting

* adding default value to next() to prevent StopIteration error

* oops, actually fixing...

* fixing docstrings and cleaning imports

* adding simple tests for describe

* linting

* Autogenerated pipeline names (#122)

* Basic name without check

* Add assert

* lint

* Changed name format and name constants

* Cleanup

* moving files to subfolders and removing hyperparameters as class var

* updating file hierarchy

* linting

* removing duplicate

* Add feature_importance tests

* adding extra tests

* adding test, cleanup

* Added linear regression and test for pipeline fitting (#131)

* Added linear regression pipeline and added test for fitting

* Remove unnecessary fit for xgboost component  (#148)

* Remove fit for xgboost

* Remove kwargs

* addressing pr comments: rename, abstract feature_importances, del __init__, cleanup, etc. (#149)

* addressing pr comments: rename, del __init__, cleanup

* cleaning up describe

* feature importances + cleanup of components, added subclasses encoder + feature_selector

* linting, fixing errors

* adding less specific version

* addressing comments

* import errors

* String and component_type component (#153)

* added handling str and component type for component list

* Adding model_type / problem_type to PipelineBase to allow inference (#157)

* adding problem_type and model_type to pipeline base

* problem_type --> problem_types

* cleaning up self.component_list and init pipeline, typo

* forgot to remove print

* removing comments

* changelog

* removing generic SelectFromModel

* Jeremy changes (#164)

* cleanup components utils

* Switch to category_encoder and cleanup RF Select

* add feature_importance to estimator

* More switching to CE

* Move parameters to class attributes and make test

* lint

* Separate changelog test

* Text cleanup

* wip: addressing comments (#159)

* wip: addressing comments

* feature names?

* adding fix for no estimator in generate_name

* fixing test

* addressing more comments on feature_importance

* feature_importances fixed and cleaned :)

* oops, missed merge conflict

* change of name

* minor cleanup

* adding basic test for two feature selectors and adding default components in DEFAULT_COMPONENTS

* adding tests for retaining feature names in input_feature_names

* addressing comments

* cleanup

* fixing
  • Loading branch information
angela97lin committed Nov 4, 2019
1 parent e412fdb commit 4a58a11
Show file tree
Hide file tree
Showing 47 changed files with 1,471 additions and 213 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,4 @@ venv.bak/

# mypy
.mypy_cache/
.DS_Store
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Changelog
* Added support for unlimited pipelines with a max_time limit :pr:`70`
* Fixes
* Changes
* Refactoring pipelines :pr:`108`
* Documentation Changes
* Testing Changes

Expand Down
2 changes: 0 additions & 2 deletions evalml/models/auto_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,6 @@ def _do_iteration(self, X, y, pbar, raise_errors):

# propose the next best parameters for this pipeline
parameters = self._propose_parameters(pipeline_class)

# fit and score the pipeline
pipeline = pipeline_class(
objective=self.objective,
Expand Down Expand Up @@ -199,7 +198,6 @@ def _do_iteration(self, X, y, pbar, raise_errors):
try:
pipeline.fit(X_train, y_train)
score, other_scores = pipeline.score(X_test, y_test, other_objectives=self.additional_objectives)

except Exception as e:
if raise_errors:
raise e
Expand Down
20 changes: 19 additions & 1 deletion evalml/pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,29 @@
# flake8:noqa
from .components import (
Estimator,
OneHotEncoder,
SimpleImputer,
StandardScaler,
Transformer,
LinearRegressor,
LogisticRegressionClassifier,
RandomForestClassifier,
RandomForestRegressor,
XGBoostClassifier,
ComponentTypes,
FeatureSelector,
CategoricalEncoder,
RFClassifierSelectFromModel,
RFRegressorSelectFromModel
)

from .pipeline_base import PipelineBase
from .classification import (
LogisticRegressionPipeline,
RFClassificationPipeline,
XGBoostPipeline
)
from .regression import RFRegressionPipeline
from .regression import LinearRegressionPipeline, RFRegressionPipeline
from .utils import (
get_pipelines,
list_model_types,
Expand Down
63 changes: 19 additions & 44 deletions evalml/pipelines/classification/logistic_regression.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from skopt.space import Real

from evalml.model_types import ModelTypes
from evalml.pipelines import PipelineBase
from evalml.pipelines.components import (
LogisticRegressionClassifier,
OneHotEncoder,
SimpleImputer,
StandardScaler
)
from evalml.problem_types import ProblemTypes


class LogisticRegressionPipeline(PipelineBase):
"""Logistic Regression Pipeline for both binary and multiclass classification"""
name = "LogisticRegression w/ imputation + scaling"
name = "Logistic Regression Classifier w/ One Hot Encoder + Simple Imputer + Standard Scaler"
model_type = ModelTypes.LINEAR_MODEL
problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

Expand All @@ -25,39 +24,15 @@ class LogisticRegressionPipeline(PipelineBase):
}

def __init__(self, objective, penalty, C, impute_strategy,
number_features, n_jobs=1, random_state=0):
imputer = SimpleImputer(strategy=impute_strategy)
enc = ce.OneHotEncoder(use_cat_names=True, return_df=True)

estimator = LogisticRegression(random_state=random_state,
penalty=penalty,
C=C,
multi_class='auto',
solver="lbfgs",
n_jobs=-1)

self.pipeline = Pipeline(
[("encoder", enc),
("imputer", imputer),
("scaler", StandardScaler()),
("estimator", estimator)]
)

super().__init__(objective=objective, random_state=random_state)

@property
def feature_importances(self):
"""Return feature importances. Feature dropped by feaure selection are excluded"""
coef_ = self.pipeline["estimator"].coef_

# binary classification case
if len(coef_) <= 2:
importances = list(zip(self.input_feature_names, coef_[0]))
importances.sort(key=lambda x: -abs(x[1]))
else:
# multiclass classification case
importances = list(zip(self.input_feature_names, np.linalg.norm(coef_, axis=0, ord=2)))
importances.sort(key=lambda x: -(x[1]))

df = pd.DataFrame(importances, columns=["feature", "importance"])
return df
number_features, n_jobs=-1, random_state=0):

imputer = SimpleImputer(impute_strategy=impute_strategy)
enc = OneHotEncoder()
scaler = StandardScaler()
estimator = LogisticRegressionClassifier(random_state=random_state,
penalty=penalty,
C=C,
n_jobs=-1)

super().__init__(objective=objective,
component_list=[enc, imputer, scaler, estimator])
59 changes: 21 additions & 38 deletions evalml/pipelines/classification/random_forest.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from skopt.space import Integer, Real

from evalml.model_types import ModelTypes
from evalml.pipelines import PipelineBase
from evalml.pipelines.components import (
OneHotEncoder,
RandomForestClassifier,
RFClassifierSelectFromModel,
SimpleImputer
)
from evalml.problem_types import ProblemTypes


class RFClassificationPipeline(PipelineBase):
"""Random Forest Pipeline for both binary and multiclass classification"""
name = "Random Forest w/ imputation"
name = "Random Forest Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model"
model_type = ModelTypes.RANDOM_FOREST
problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

Expand All @@ -27,36 +27,19 @@ class RFClassificationPipeline(PipelineBase):

def __init__(self, objective, n_estimators, max_depth, impute_strategy,
percent_features, number_features, n_jobs=1, random_state=0):
imputer = SimpleImputer(strategy=impute_strategy)
enc = ce.OneHotEncoder(use_cat_names=True, return_df=True)

estimator = RandomForestClassifier(random_state=random_state,
n_estimators=n_estimators,
imputer = SimpleImputer(impute_strategy=impute_strategy)
enc = OneHotEncoder()
estimator = RandomForestClassifier(n_estimators=n_estimators,
max_depth=max_depth,
n_jobs=n_jobs)

feature_selection = SelectFromModel(
estimator=estimator,
max_features=max(1, int(percent_features * number_features)),
threshold=-np.inf
)

self.pipeline = Pipeline(
[("encoder", enc),
("imputer", imputer),
("feature_selection", feature_selection),
("estimator", estimator)]
)

super().__init__(objective=objective, random_state=random_state)

@property
def feature_importances(self):
"""Return feature importances. Feature dropped by feaure selection are excluded"""
indices = self.pipeline["feature_selection"].get_support(indices=True)
feature_names = list(map(lambda i: self.input_feature_names[i], indices))
importances = list(zip(feature_names, self.pipeline["estimator"].feature_importances_))
importances.sort(key=lambda x: -abs(x[1]))

df = pd.DataFrame(importances, columns=["feature", "importance"])
return df
n_jobs=n_jobs,
random_state=random_state)
feature_selection = RFClassifierSelectFromModel(n_estimators=n_estimators,
max_depth=max_depth,
number_features=number_features,
percent_features=percent_features,
threshold=-np.inf,
random_state=random_state)

super().__init__(objective=objective,
component_list=[enc, imputer, feature_selection, estimator])
96 changes: 24 additions & 72 deletions evalml/pipelines/classification/xgboost.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from skopt.space import Integer, Real
from xgboost import XGBClassifier

from evalml.model_types import ModelTypes
from evalml.pipelines import PipelineBase
from evalml.pipelines.components import (
OneHotEncoder,
RFClassifierSelectFromModel,
SimpleImputer,
XGBoostClassifier
)
from evalml.problem_types import ProblemTypes


class XGBoostPipeline(PipelineBase):
"""XGBoost Pipeline for both binary and multiclass classification"""
name = "XGBoost w/ imputation"
name = "XGBoost Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model"
model_type = ModelTypes.XGBOOST
problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

Expand All @@ -27,68 +27,20 @@ class XGBoostPipeline(PipelineBase):
}

def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy,
percent_features, number_features, n_jobs=1, random_state=0):
imputer = SimpleImputer(strategy=impute_strategy)
enc = ce.OneHotEncoder(use_cat_names=True, return_df=True)

estimator = XGBClassifier(
random_state=random_state,
eta=eta,
max_depth=max_depth,
min_child_weight=min_child_weight
)

feature_selection = SelectFromModel(
estimator=estimator,
max_features=max(1, int(percent_features * number_features)),
threshold=-np.inf
)

self.pipeline = Pipeline(
[("encoder", enc),
("imputer", imputer),
("feature_selection", feature_selection),
("estimator", estimator)]
)

super().__init__(objective=objective, random_state=random_state)

# Need to override fit for multiclass
def fit(self, X, y, objective_fit_size=.2):
"""Build a model
Arguments:
X (pd.DataFrame or np.array): the input training data of shape [n_samples, n_features]
y (pd.Series): the target training labels of length [n_samples]
Returns:
self
"""
# check if problem is multiclass
num_classes = len(np.unique(y))
if num_classes > 2:
params = self.pipeline['estimator'].get_params()
params.update(
{
"objective": 'multi:softprob',
"num_class": num_classes
})

estimator = XGBClassifier(**params)
self.pipeline.steps[-1] = ('estimator', estimator)

return super().fit(X, y, objective_fit_size)

@property
def feature_importances(self):
"""Return feature importances. Feature dropped by feaure selection are excluded"""
indices = self.pipeline["feature_selection"].get_support(indices=True)
feature_names = list(map(lambda i: self.input_feature_names[i], indices))
importances = list(zip(feature_names, self.pipeline["estimator"].feature_importances_))
importances.sort(key=lambda x: -abs(x[1]))

df = pd.DataFrame(importances, columns=["feature", "importance"])
return df
percent_features, number_features, n_estimators=10, n_jobs=1, random_state=0):

imputer = SimpleImputer(impute_strategy=impute_strategy)
enc = OneHotEncoder()
feature_selection = RFClassifierSelectFromModel(n_estimators=n_estimators,
max_depth=max_depth,
number_features=number_features,
percent_features=percent_features,
threshold=-np.inf,
random_state=random_state)
estimator = XGBoostClassifier(random_state=random_state,
eta=eta,
max_depth=max_depth,
min_child_weight=min_child_weight)

super().__init__(objective=objective,
component_list=[enc, imputer, feature_selection, estimator])
23 changes: 23 additions & 0 deletions evalml/pipelines/components/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# flake8:noqa
from .component_base import ComponentBase
from .component_types import ComponentTypes
from .estimators import (
Estimator,
LinearRegressor,
LogisticRegressionClassifier,
RandomForestClassifier,
RandomForestRegressor,
XGBoostClassifier
)
from .transformers import (
Transformer,
OneHotEncoder,
RFClassifierSelectFromModel,
RFRegressorSelectFromModel,
SimpleImputer,
StandardScaler,
FeatureSelector,
CategoricalEncoder
)

from .utils import handle_component, str_to_component_type
Loading

0 comments on commit 4a58a11

Please sign in to comment.