
wip: addressing comments #159

Merged 14 commits on Oct 31, 2019
1 change: 0 additions & 1 deletion evalml/pipelines/classification/logistic_regression.py
@@ -36,5 +36,4 @@ def __init__(self, objective, penalty, C, impute_strategy,

        super().__init__(objective=objective,
                         name=self.name,
-                        problem_type=self.problem_types,
                         component_list=[enc, imputer, scaler, estimator])
1 change: 0 additions & 1 deletion evalml/pipelines/classification/random_forest.py
@@ -44,5 +44,4 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy,

        super().__init__(objective=objective,
                         name=self.name,
-                        problem_type=self.problem_types,
                         component_list=[enc, imputer, feature_selection, estimator])
1 change: 0 additions & 1 deletion evalml/pipelines/classification/xgboost.py
@@ -47,5 +47,4 @@ def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy,

        super().__init__(objective=objective,
                         name=self.name,
-                        problem_type=self.problem_types,
                         component_list=[enc, imputer, feature_selection, estimator])
5 changes: 4 additions & 1 deletion evalml/pipelines/components/component_base.py
@@ -36,4 +36,7 @@ def describe(self, print_name=False, return_dict=False):

            parameter_str = ("\t * {} : {}").format(parameter, self.parameters[parameter])
            self.logger.log(parameter_str)
        if return_dict:
-            return self.parameters
+            component_dict = {}
+            component_dict.update({"name": self.name})
+            component_dict.update({"parameters": self.parameters})
+            return component_dict
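
The new return shape is easiest to see in isolation. A minimal usage sketch, assuming SimpleImputer is importable from evalml.pipelines.components as the updated tests below suggest:

from evalml.pipelines.components import SimpleImputer

imputer = SimpleImputer("mean")
# describe(return_dict=True) now wraps the parameters in a dict that
# also carries the component's display name, instead of returning the
# bare parameter dict.
assert imputer.describe(return_dict=True) == {
    "name": "Simple Imputer",
    "parameters": {"impute_strategy": "mean"},
}
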
1 change: 0 additions & 1 deletion evalml/pipelines/components/estimators/estimator.py
@@ -1,4 +1,3 @@
-
 from evalml.pipelines.components import ComponentBase

@@ -7,3 +7,15 @@ class FeatureSelector(Transformer):
    def get_indices(self):
        indices = self._component_obj.get_support(indices=True)
        return indices
+
+    def get_names(self, all_feature_names):
+        """Get names of selected features.
+
+        Args:
+            all_feature_names: feature names
+
+        Returns:
+            list of the names of features selected
+        """
+        indices = self.get_indices()
+        return list(map(lambda i: all_feature_names[i], indices))
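
Since get_names is just an index-to-name mapping over get_indices, a plain-Python sketch (column names illustrative) shows the behavior:

# Suppose the fitted selector kept columns 0 and 2 of a three-column frame;
# get_indices() would return those positions.
all_feature_names = ["age", "zipcode", "income"]
selected_indices = [0, 2]

# get_names maps each surviving index back to its original column name.
selected_names = list(map(lambda i: all_feature_names[i], selected_indices))
assert selected_names == ["age", "income"]
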
48 changes: 20 additions & 28 deletions evalml/pipelines/pipeline_base.py
@@ -3,23 +3,21 @@
import pandas as pd
from sklearn.model_selection import train_test_split

-from .components import Encoder, Estimator, FeatureSelector, handle_component
+from .components import Estimator, handle_component

from evalml.objectives import get_objective
from evalml.utils import Logger


class PipelineBase:
-    def __init__(self, name, objective, component_list, problem_type=None, n_jobs=-1, random_state=0):
+    def __init__(self, name, objective, component_list, n_jobs=-1, random_state=0):
        """Machine learning pipeline made out of transformers and an estimator.

        Arguments:
            objective (Object): the objective to optimize

            component_list (list): List of components in order

-            problem_type (ProblemTypes): Machine learning problem associated with the pipeline
-
            random_state (int): random seed/state

            n_jobs (int): Number of jobs to run in parallel
@@ -29,18 +27,17 @@ def __init__(self, objective, component_list, problem_type=None, n_jobs=-1
        self.random_state = random_state
        self.component_list = [handle_component(component) for component in component_list]
        self.component_names = [comp.name for comp in self.component_list]
-        self.name = self._generate_name()  # autogenerated

+        self.input_feature_names = {}
        # check if one and only estimator in pipeline is the last element in component_list
-        estimator = next((component for component in self.component_list if (isinstance(component, Estimator))), None)
-        if estimator is not None:
-            self.estimator = estimator
-            self.problem_types = estimator.problem_types
-            self.model_type = estimator.model_type
-            estimator_index = self.component_list.index(estimator)
+        self.estimator = next((component for component in self.component_list if (isinstance(component, Estimator))), None)
+        if self.estimator is not None:
+            self.problem_types = self.estimator.problem_types
+            self.model_type = self.estimator.model_type
+            estimator_index = self.component_list.index(self.estimator)
            if estimator_index != len(self.component_list) - 1:
                raise RuntimeError("Estimator must be the last component in the pipeline.")

+        self.name = self._generate_name()  # autogenerated
        self.results = {}
        self.n_jobs = n_jobs
        self.parameters = {}
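
With problem_type gone from the signature, problem_types and model_type now come from the estimator itself. A sketch of the resulting behavior, assuming these component import paths and that string objectives resolve through get_objective:

from evalml.pipelines import PipelineBase
from evalml.pipelines.components import (
    LogisticRegressionClassifier,
    OneHotEncoder,
    SimpleImputer,
)

pipeline = PipelineBase(
    name="ignored",  # overwritten by _generate_name()
    objective="precision",
    component_list=[OneHotEncoder(), SimpleImputer("mean"),
                    LogisticRegressionClassifier()],
)
# problem_types and model_type mirror the final estimator component.
assert pipeline.problem_types == pipeline.estimator.problem_types
assert pipeline.model_type == pipeline.estimator.model_type
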
@@ -64,7 +61,10 @@ def __setitem__(self, index, value):
        raise NotImplementedError('Setting pipeline components is not supported.')

    def _generate_name(self):
-        name = "{}".format(self.component_list[-1].name)
+        if self.estimator is not None:
+            name = "{}".format(self.estimator.name)
+        else:
+            name = "Pipeline"
        for index, component in enumerate(self.component_list[:-1]):
            if index == 0:
                name += " w/ {}".format(component.name)
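
The join for transformers after the first is collapsed in this diff; assuming it is " + ", a standalone sketch of the naming logic:

def generate_name(estimator_name, transformer_names):
    # Mirrors the new _generate_name: lead with the estimator (or a
    # generic fallback), then append the transformers in pipeline order.
    name = estimator_name if estimator_name is not None else "Pipeline"
    for index, transformer in enumerate(transformer_names):
        if index == 0:
            name += " w/ {}".format(transformer)
        else:
            name += " + {}".format(transformer)  # assumed separator
    return name

assert generate_name("Logistic Regression Classifier",
                     ["One Hot Encoder", "Simple Imputer"]) == \
    "Logistic Regression Classifier w/ One Hot Encoder + Simple Imputer"
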
@@ -113,11 +113,13 @@ def _fit(self, X, y):
        X_t = X
        y_t = y
        for component in self.component_list[:-1]:
+            self.input_feature_names.update({component.name: list(pd.DataFrame(X_t))})
            if component._needs_fitting:
                X_t = component.fit_transform(X_t, y_t)
            else:
                X_t = component.transform(X_t, y_t)
-        self.component_list[-1].fit(X_t, y_t)
+        self.input_feature_names.update({self.estimator.name: list(pd.DataFrame(X_t))})
+        self.estimator.fit(X_t, y_t)
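
After _fit, input_feature_names maps each component (estimator included) to the column names of the frame it saw. An illustrative sketch of the bookkeeping, independent of evalml:

import pandas as pd

input_feature_names = {}
X_t = pd.DataFrame({"age": [25, 40], "income": [30_000, 82_000]})

# Same trick as _fit: list(pd.DataFrame(X_t)) yields the column names.
input_feature_names.update({"Simple Imputer": list(pd.DataFrame(X_t))})
assert input_feature_names == {"Simple Imputer": ["age", "income"]}
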

    def fit(self, X, y, objective_fit_size=.2):
        """Build a model
@@ -146,12 +148,6 @@ def fit(self, X, y, objective_fit_size=.2):

        self._fit(X, y)

-        encoder = next((component for component in self.component_list if (isinstance(component, Encoder))), None)
-        if encoder is not None:
-            self.input_feature_names = encoder.get_feature_names()
-        else:
-            self.input_feature_names = X.columns.tolist()
-
        if self.objective.needs_fitting:
            if self.objective.fit_needs_proba:
                y_predicted = self.predict_proba(X_objective)
@@ -181,14 +177,14 @@ def predict(self, X):
            y_predicted = self.predict_proba(X)
        else:
            X_t = self._transform(X)
-            y_predicted = self.component_list[-1].predict(X_t)
+            y_predicted = self.estimator.predict(X_t)

        if self.objective.uses_extra_columns:
            return self.objective.predict(y_predicted, X)

        return self.objective.predict(y_predicted)

-        return self.component_list[-1].predict(X_t)
+        return self.estimator.predict(X_t)

    def predict_proba(self, X):
        """Make probability estimates for labels.
@@ -200,7 +196,7 @@ def predict_proba(self, X):
            DataFrame : probability estimates
        """
        X = self._transform(X)
-        proba = self.component_list[-1].predict_proba(X)
+        proba = self.estimator.predict_proba(X)
        if proba.shape[1] <= 2:
            return proba[:, 1]
        else:
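
The [:, 1] slice relies on the usual sklearn-style layout where column 1 of a binary predict_proba result is the positive class; a quick numpy check:

import numpy as np

proba = np.array([[0.8, 0.2],
                  [0.3, 0.7]])  # rows: samples; cols: [negative, positive]
assert np.allclose(proba[:, 1], [0.2, 0.7])
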
@@ -247,11 +243,7 @@ def score(self, X, y, other_objectives=None):
    @property
    def feature_importances(self):
        """Return feature importances. Features dropped by feature selection are excluded"""
-        feature_selector = next((component for component in self.component_list if (isinstance(component, FeatureSelector))), None)
-        feature_names = self.input_feature_names
-        if feature_selector is not None:
-            indices = feature_selector.get_indices()
-            feature_names = list(map(lambda i: self.input_feature_names[i], indices))
+        feature_names = self.input_feature_names[self.estimator.name]
        importances = list(zip(feature_names, self.estimator.feature_importances))  # note: this only works for binary
        importances.sort(key=lambda x: -abs(x[1]))
        df = pd.DataFrame(importances, columns=["feature", "importance"])
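
Because input_feature_names is recorded per component during _fit, looking up the estimator's entry already reflects any columns a selector dropped, so the old FeatureSelector scan is unnecessary. Output sketch, where fitted_pipeline is a placeholder for any fitted PipelineBase and the values are illustrative:

# After fitting a pipeline whose selector dropped 'zipcode':
print(fitted_pipeline.feature_importances)
#     feature  importance
# 0    income        0.61
# 1       age       -0.23
# Rows are sorted by absolute importance, largest first.
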
1 change: 0 additions & 1 deletion evalml/pipelines/regression/linear_regression.py
@@ -30,5 +30,4 @@ def __init__(self, objective, random_state, number_features, impute_strategy, no

        super().__init__(objective=objective,
                         name=self.name,
-                        problem_type=self.problem_types,
                         component_list=[enc, imputer, scaler, estimator])
1 change: 0 additions & 1 deletion evalml/pipelines/regression/random_forest.py
@@ -42,5 +42,4 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_

        super().__init__(objective=objective,
                         name=self.name,
-                        problem_type=self.problem_types,
                         component_list=[enc, imputer, feature_selection, estimator])
1 change: 0 additions & 1 deletion evalml/problem_types/utils.py
@@ -10,7 +10,6 @@ def handle_problem_types(problem_type):

    Returns:
        ProblemTypes
    """
-
    if isinstance(problem_type, str):
        try:
            tpe = ProblemTypes[problem_type.upper()]
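
A usage sketch of the string path through handle_problem_types (import location assumed from the module path above):

from evalml.problem_types import ProblemTypes, handle_problem_types

# Strings are upper-cased and matched against the enum members.
assert handle_problem_types("binary") == ProblemTypes.BINARY
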
18 changes: 9 additions & 9 deletions evalml/tests/component_tests/test_components.py
@@ -45,22 +45,22 @@ def test_describe_component():

    imputer = SimpleImputer("mean")
    scaler = StandardScaler()
    feature_selection = RFClassifierSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=10)
-    assert enc.describe(return_dict=True) == {}
-    assert imputer.describe(return_dict=True) == {"impute_strategy": "mean"}
-    assert scaler.describe(return_dict=True) == {}
-    assert feature_selection.describe(return_dict=True) == {"percent_features": 0.3, "threshold": 10}
+    assert enc.describe(return_dict=True) == {'name': 'One Hot Encoder', 'parameters': {}}
+    assert imputer.describe(return_dict=True) == {'name': 'Simple Imputer', 'parameters': {'impute_strategy': 'mean'}}
+    assert scaler.describe(return_dict=True) == {'name': 'Standard Scaler', 'parameters': {}}
+    assert feature_selection.describe(return_dict=True) == {'name': 'RF Select From Model', 'parameters': {'percent_features': 0.3, 'threshold': 10}}

    # testing estimators
    lr_classifier = LogisticRegressionClassifier()
    rf_classifier = RandomForestClassifier(n_estimators=10, max_depth=3)
    xgb_classifier = XGBoostClassifier(eta=0.1, min_child_weight=1, max_depth=3)
    rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=3)
    linear_regressor = LinearRegressor()
-    assert lr_classifier.describe(return_dict=True) == {"penalty": "l2", "C": 1.0}
-    assert rf_classifier.describe(return_dict=True) == {"n_estimators": 10, "max_depth": 3}
-    assert xgb_classifier.describe(return_dict=True) == {"eta": 0.1, "max_depth": 3, "min_child_weight": 1}
-    assert rf_regressor.describe(return_dict=True) == {"n_estimators": 10, "max_depth": 3}
-    assert linear_regressor.describe(return_dict=True) == {"fit_intercept": True, 'normalize': False}
+    assert lr_classifier.describe(return_dict=True) == {'name': 'Logistic Regression Classifier', 'parameters': {'C': 1.0, 'penalty': 'l2'}}
+    assert rf_classifier.describe(return_dict=True) == {'name': 'Random Forest Classifier', 'parameters': {'max_depth': 3, 'n_estimators': 10}}
+    assert xgb_classifier.describe(return_dict=True) == {'name': 'XGBoost Classifier', 'parameters': {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1}}
+    assert rf_regressor.describe(return_dict=True) == {'name': 'Random Forest Regressor', 'parameters': {'max_depth': 3, 'n_estimators': 10}}
+    assert linear_regressor.describe(return_dict=True) == {'name': 'Linear Regressor', 'parameters': {'fit_intercept': True, 'normalize': False}}


def test_missing_methods_on_components(X_y):
2 changes: 1 addition & 1 deletion evalml/tests/pipeline_tests/test_pipelines.py
@@ -131,7 +131,7 @@ def __init__(self, objective, penalty, C, impute_strategy,

                                          penalty=penalty,
                                          C=C,
                                          n_jobs=-1)
-        super().__init__(objective=objective, name=self.name, problem_type=[ProblemTypes.BINARY, ProblemTypes.MULTICLASS], component_list=[enc, imputer, estimator, scaler])
+        super().__init__(objective=objective, name=self.name, component_list=[enc, imputer, estimator, scaler])

    err_msg = "Estimator must be the last component in the pipeline."
    with pytest.raises(RuntimeError, match=err_msg):