Skip to content

Commit

Permalink
Merge branch 'main' into 1325_data_checks_returns_dict
Browse files Browse the repository at this point in the history
  • Loading branch information
angela97lin committed Nov 19, 2020
2 parents c4f12ab + f54abd3 commit b08447c
Show file tree
Hide file tree
Showing 35 changed files with 465 additions and 305 deletions.
7 changes: 4 additions & 3 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ Release Notes

**Future Releases**
* Enhancements
* Added ability to freeze hyperparameters for AutoMLSearch :pr:`1284`
* Added `Target Encoder` into transformer components :pr:`1401`
* Updated pipelines and ``make_pipeline`` to accept Woodwork DataTables :pr:`1393`
* Updated pipelines and ``make_pipeline`` to accept ``Woodwork`` inputs :pr:`1393`
* Updated components to accept ``Woodwork`` inputs :pr:`1423`
* Added ability to freeze hyperparameters for ``AutoMLSearch`` :pr:`1284`
* Added ``Target Encoder`` into transformer components :pr:`1401`
* Added callback for error handling in ``AutoMLSearch`` :pr:`1403`
* Added the index id to the ``explain_predictions_best_worst`` output to help users identify which rows in their data are included :pr:`1365`
* The ``top_k`` features displayed in ``explain_predictions_*`` functions are now determined by the magnitude of SHAP values as opposed to the ``top_k`` largest and smallest SHAP values. :pr:`1374`
Expand Down
8 changes: 0 additions & 8 deletions evalml/pipelines/classification_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def fit(self, X, y):
"""
X = _convert_to_woodwork_structure(X)
y = _convert_to_woodwork_structure(y)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
y = _convert_woodwork_types_wrapper(y.to_series())
self._encoder.fit(y)
y = self._encode_targets(y)
Expand Down Expand Up @@ -92,8 +91,6 @@ def predict(self, X, objective=None):
Returns:
pd.Series : Estimated labels
"""
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
predictions = self._predict(X, objective)
return pd.Series(self._decode_targets(predictions))

Expand All @@ -106,8 +103,6 @@ def predict_proba(self, X):
Returns:
pd.DataFrame: Probability estimates
"""
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
X = self.compute_estimator_features(X, y=None)
proba = self.estimator.predict_proba(X)
proba.columns = self._encoder.classes_
Expand All @@ -124,11 +119,8 @@ def score(self, X, y, objectives):
Returns:
dict: Ordered dictionary of objective scores
"""
X = _convert_to_woodwork_structure(X)
y = _convert_to_woodwork_structure(y)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
y = _convert_woodwork_types_wrapper(y.to_series())

objectives = [get_objective(o, return_instance=True) for o in objectives]
y = self._encode_targets(y)
y_predicted, y_predicted_proba = self._compute_predictions(X, objectives)
Expand Down
21 changes: 15 additions & 6 deletions evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
log_subtitle,
safe_repr
)
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)

logger = get_logger(__file__)

Expand Down Expand Up @@ -71,7 +75,7 @@ def clone(self, random_state=0):
"""Constructs a new component with the same parameters
Arguments:
random_state (int): the value to seed the random state with. Can also be a RandomState instance. Defaults to 0.
random_state (int, RandomState): the value to seed the random state with. Can also be a RandomState instance. Defaults to 0.
Returns:
A new instance of this component with identical parameters
Expand All @@ -82,12 +86,17 @@ def fit(self, X, y=None):
"""Fits component to data
Arguments:
X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
y (pd.Series, optional): the target training data of length [n_samples]
X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
y (ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples]
Returns:
self
"""
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
if y is not None:
y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())
try:
self._component_obj.fit(X, y)
return self
Expand Down Expand Up @@ -119,8 +128,8 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL):
"""Saves component at file path
Arguments:
file_path (str): location to save file
pickle_protocol (int): the pickle data stream format.
file_path (str): Location to save file
pickle_protocol (int): The pickle data stream format.
Returns:
None
Expand All @@ -133,7 +142,7 @@ def load(file_path):
"""Loads component at file path
Arguments:
file_path (str): location to load file
file_path (str): Location to load file
Returns:
ComponentBase object
Expand Down
37 changes: 5 additions & 32 deletions evalml/pipelines/components/ensemble/stacked_ensemble_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import pandas as pd

from evalml.exceptions import EnsembleMissingPipelinesError
from evalml.model_family import ModelFamily
from evalml.pipelines.components import Estimator
Expand Down Expand Up @@ -78,33 +76,8 @@ def default_parameters(cls):
Returns:
dict: default parameters for this component.
"""
return {'final_estimator': None,
'cv': None,
'n_jobs': 1,
}

def fit(self, X, y=None):
"""Fits component to data
Arguments:
X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
y (pd.Series, optional): the target training data of length [n_samples]
Returns:
self
"""
self._component_obj.fit(X, y)
return self

def predict(self, X):
"""Make predictions using selected features.
Arguments:
X (pd.DataFrame): Features
Returns:
pd.Series: Predicted values
"""
predictions = self._component_obj.predict(X)
predictions = pd.Series(predictions)
return predictions
return {
'final_estimator': None,
'cv': None,
'n_jobs': 1,
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold

Expand Down Expand Up @@ -41,17 +40,3 @@ def __init__(self, input_pipelines=None, final_estimator=None,
"""
super().__init__(input_pipelines=input_pipelines, final_estimator=final_estimator,
cv=cv, n_jobs=n_jobs, random_state=random_state, **kwargs)

def predict_proba(self, X):
"""Make probability estimates for labels.
Arguments:
X (pd.DataFrame): Features
Returns:
pd.DataFrame: Probability estimates
"""
pred_proba = self._component_obj.predict_proba(X)
if not isinstance(pred_proba, pd.DataFrame):
pred_proba = pd.DataFrame(pred_proba)
return pred_proba
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)


class BaselineClassifier(Estimator):
Expand Down Expand Up @@ -40,11 +44,10 @@ def __init__(self, strategy="mode", random_state=0, **kwargs):
def fit(self, X, y=None):
if y is None:
raise ValueError("Cannot fit Baseline classifier if y is None")

if not isinstance(y, pd.Series):
y = pd.Series(y)
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())

vals, counts = np.unique(y, return_counts=True)
self._classes = list(vals)
Expand All @@ -57,6 +60,8 @@ def fit(self, X, y=None):
return self

def predict(self, X):
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
strategy = self.parameters["strategy"]
if strategy == "mode":
return pd.Series([self._mode] * len(X))
Expand All @@ -66,6 +71,8 @@ def predict(self, X):
return self.random_state.choice(self._classes, len(X), p=self._percentage_freq)

def predict_proba(self, X):
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
strategy = self.parameters["strategy"]
if strategy == "mode":
mode_index = self._classes.index(self._mode)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise
from evalml.utils.gen_utils import categorical_dtypes
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)


class CatBoostClassifier(Estimator):
Expand Down Expand Up @@ -56,11 +59,11 @@ def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None,
random_state=random_state)

def fit(self, X, y=None):
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
if not isinstance(y, pd.Series):
y = pd.Series(y)
cat_cols = X.select_dtypes(categorical_dtypes)
X = _convert_to_woodwork_structure(X)
cat_cols = list(X.select('category').columns)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())

# For binary classification, catboost expects numeric values, so encoding before.
if y.nunique() <= 2:
Expand All @@ -70,6 +73,8 @@ def fit(self, X, y=None):
return model

def predict(self, X):
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
predictions = self._component_obj.predict(X)
if predictions.ndim == 2 and predictions.shape[1] == 1:
predictions = predictions.flatten()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise
from evalml.utils.gen_utils import categorical_dtypes
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper,
_rename_column_names_to_numeric
)


class LightGBMClassifier(Estimator):
Expand Down Expand Up @@ -69,42 +73,42 @@ def __init__(self, boosting_type="gbdt", learning_rate=0.1, n_estimators=100, ma
random_state=random_seed)

def _encode_categories(self, X, fit=False):
X2 = pd.DataFrame(copy.copy(X))
# encode each categorical feature as an integer
X2.columns = np.arange(len(X2.columns))
# necessary to wipe out column names in case any names contain symbols ([, ], <) which LightGBM cannot properly handle
cat_cols = X2.select_dtypes(categorical_dtypes).columns
"""Encodes each categorical feature using ordinal encoding."""
X_encoded = _convert_to_woodwork_structure(X)
X_encoded = _rename_column_names_to_numeric(X_encoded)
cat_cols = list(X_encoded.select('category').columns)
X_encoded = _convert_woodwork_types_wrapper(X_encoded.to_dataframe())
if len(cat_cols) == 0:
return X2
return X_encoded
if fit:
self._ordinal_encoder = OrdinalEncoder()
encoder_output = self._ordinal_encoder.fit_transform(X2[cat_cols])
encoder_output = self._ordinal_encoder.fit_transform(X_encoded[cat_cols])
else:
encoder_output = self._ordinal_encoder.transform(X2[cat_cols])
X2[cat_cols] = pd.DataFrame(encoder_output)
X2[cat_cols] = X2[cat_cols].astype('category')
return X2
encoder_output = self._ordinal_encoder.transform(X_encoded[cat_cols])
X_encoded[cat_cols] = pd.DataFrame(encoder_output)
X_encoded[cat_cols] = X_encoded[cat_cols].astype('category')
return X_encoded

def _encode_labels(self, y):
y1 = pd.Series(y)
y_encoded = pd.Series(y)
# change only if dtype isn't int
if not is_integer_dtype(y1):
if not is_integer_dtype(y_encoded):
self._label_encoder = LabelEncoder()
y1 = pd.Series(self._label_encoder.fit_transform(y1), dtype='int64')
return y1
y_encoded = pd.Series(self._label_encoder.fit_transform(y_encoded), dtype='int64')
return y_encoded

def fit(self, X, y=None):
X2 = self._encode_categories(X, fit=True)
y2 = self._encode_labels(y)
return super().fit(X2, y2)
X_encoded = self._encode_categories(X, fit=True)
y_encoded = self._encode_labels(y)
return super().fit(X_encoded, y_encoded)

def predict(self, X):
X2 = self._encode_categories(X)
predictions = super().predict(X2)
X_encoded = self._encode_categories(X)
predictions = super().predict(X_encoded)
if self._label_encoder:
predictions = pd.Series(self._label_encoder.inverse_transform(predictions.astype(np.int64)))
return predictions

def predict_proba(self, X):
X2 = self._encode_categories(X)
return super().predict_proba(X2)
X_encoded = self._encode_categories(X)
return super().predict_proba(X_encoded)
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import pandas as pd
from skopt.space import Integer, Real

from evalml.model_family import ModelFamily
Expand Down Expand Up @@ -42,21 +41,16 @@ def __init__(self, eta=0.1, max_depth=6, min_child_weight=1, n_estimators=100, r
random_state=random_state)

def fit(self, X, y=None):
# rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle
if isinstance(X, pd.DataFrame):
X = _rename_column_names_to_numeric(X)
X = _rename_column_names_to_numeric(X)
return super().fit(X, y)

def predict(self, X):
# rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle
if isinstance(X, pd.DataFrame):
X = _rename_column_names_to_numeric(X)
X = _rename_column_names_to_numeric(X)
predictions = super().predict(X)
return predictions

def predict_proba(self, X):
if isinstance(X, pd.DataFrame):
X = _rename_column_names_to_numeric(X)
X = _rename_column_names_to_numeric(X)
predictions = super().predict_proba(X)
return predictions

Expand Down

0 comments on commit b08447c

Please sign in to comment.