Merge branch 'main' into 1325_data_checks_returns_dict

alteryx · Nov 19, 2020 · b08447c
2 parents c4f12ab + f54abd3
commit b08447c
Show file tree

Hide file tree

Showing 35 changed files with 465 additions and 305 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -3,9 +3,10 @@ Release Notes
 
 **Future Releases**
     * Enhancements
-        * Added ability to freeze hyperparameters for AutoMLSearch :pr:`1284`
-        * Added `Target Encoder` into transformer components :pr:`1401`
-        * Updated pipelines and ``make_pipeline`` to accept Woodwork DataTables :pr:`1393`
+        * Updated pipelines and ``make_pipeline`` to accept ``Woodwork`` inputs :pr:`1393`
+        * Updated components to accept ``Woodwork`` inputs :pr:`1423`
+        * Added ability to freeze hyperparameters for ``AutoMLSearch`` :pr:`1284`
+        * Added ``Target Encoder`` into transformer components :pr:`1401`
         * Added callback for error handling in ``AutoMLSearch`` :pr:`1403`
         * Added the index id to the ``explain_predictions_best_worst`` output to help users identify which rows in their data are included :pr:`1365`
         * The top_k features displayed in ``explain_predictions_*`` functions are now determined by the magnitude of shap values as opposed to the ``top_k`` largest and smallest shap values. :pr:`1374`

diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py
@@ -41,7 +41,6 @@ def fit(self, X, y):
         """
         X = _convert_to_woodwork_structure(X)
         y = _convert_to_woodwork_structure(y)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         y = _convert_woodwork_types_wrapper(y.to_series())
         self._encoder.fit(y)
         y = self._encode_targets(y)
@@ -92,8 +91,6 @@ def predict(self, X, objective=None):
         Returns:
             pd.Series : Estimated labels
         """
-        X = _convert_to_woodwork_structure(X)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         predictions = self._predict(X, objective)
         return pd.Series(self._decode_targets(predictions))
 
@@ -106,8 +103,6 @@ def predict_proba(self, X):
         Returns:
             pd.DataFrame: Probability estimates
         """
-        X = _convert_to_woodwork_structure(X)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         X = self.compute_estimator_features(X, y=None)
         proba = self.estimator.predict_proba(X)
         proba.columns = self._encoder.classes_
@@ -124,11 +119,8 @@ def score(self, X, y, objectives):
         Returns:
             dict: Ordered dictionary of objective scores
         """
-        X = _convert_to_woodwork_structure(X)
         y = _convert_to_woodwork_structure(y)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         y = _convert_woodwork_types_wrapper(y.to_series())
-
         objectives = [get_objective(o, return_instance=True) for o in objectives]
         y = self._encode_targets(y)
         y_predicted, y_predicted_proba = self._compute_predictions(X, objectives)

diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py
@@ -13,6 +13,10 @@
     log_subtitle,
     safe_repr
 )
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 logger = get_logger(__file__)
 
@@ -71,7 +75,7 @@ def clone(self, random_state=0):
         """Constructs a new component with the same parameters
 
         Arguments:
-            random_state (int): the value to seed the random state with. Can also be a RandomState instance. Defaults to 0.
+            random_state (int, RandomState): the value to seed the random state with. Can also be a RandomState instance. Defaults to 0.
 
         Returns:
             A new instance of this component with identical parameters
@@ -82,12 +86,17 @@ def fit(self, X, y=None):
         """Fits component to data
 
         Arguments:
-            X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
-            y (pd.Series, optional): the target training data of length [n_samples]
+            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
+            y (ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples]
 
         Returns:
             self
         """
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
+        if y is not None:
+            y = _convert_to_woodwork_structure(y)
+            y = _convert_woodwork_types_wrapper(y.to_series())
         try:
             self._component_obj.fit(X, y)
             return self
@@ -119,8 +128,8 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL):
         """Saves component at file path
 
         Arguments:
-            file_path (str): location to save file
-            pickle_protocol (int): the pickle data stream format.
+            file_path (str): Location to save file
+            pickle_protocol (int): The pickle data stream format.
 
         Returns:
             None
@@ -133,7 +142,7 @@ def load(file_path):
         """Loads component at file path
 
         Arguments:
-            file_path (str): location to load file
+            file_path (str): Location to load file
 
         Returns:
             ComponentBase object

diff --git a/evalml/pipelines/components/ensemble/stacked_ensemble_base.py b/evalml/pipelines/components/ensemble/stacked_ensemble_base.py
@@ -1,5 +1,3 @@
-import pandas as pd
-
 from evalml.exceptions import EnsembleMissingPipelinesError
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components import Estimator
@@ -78,33 +76,8 @@ def default_parameters(cls):
          Returns:
              dict: default parameters for this component.
         """
-        return {'final_estimator': None,
-                'cv': None,
-                'n_jobs': 1,
-                }
-
-    def fit(self, X, y=None):
-        """Fits component to data
-
-        Arguments:
-            X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
-            y (pd.Series, optional): the target training data of length [n_samples]
-
-        Returns:
-            self
-        """
-        self._component_obj.fit(X, y)
-        return self
-
-    def predict(self, X):
-        """Make predictions using selected features.
-
-        Arguments:
-            X (pd.DataFrame): Features
-
-        Returns:
-            pd.Series: Predicted values
-        """
-        predictions = self._component_obj.predict(X)
-        predictions = pd.Series(predictions)
-        return predictions
+        return {
+            'final_estimator': None,
+            'cv': None,
+            'n_jobs': 1,
+        }
diff --git a/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py b/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py
@@ -1,4 +1,3 @@
-import pandas as pd
 from sklearn.ensemble import StackingClassifier
 from sklearn.model_selection import StratifiedKFold
 
@@ -41,17 +40,3 @@ def __init__(self, input_pipelines=None, final_estimator=None,
         """
         super().__init__(input_pipelines=input_pipelines, final_estimator=final_estimator,
                          cv=cv, n_jobs=n_jobs, random_state=random_state, **kwargs)
-
-    def predict_proba(self, X):
-        """Make probability estimates for labels.
-
-        Arguments:
-            X (pd.DataFrame): Features
-
-        Returns:
-            pd.DataFrame: Probability estimates
-        """
-        pred_proba = self._component_obj.predict_proba(X)
-        if not isinstance(pred_proba, pd.DataFrame):
-            pred_proba = pd.DataFrame(pred_proba)
-        return pred_proba
diff --git a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py
@@ -5,6 +5,10 @@
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class BaselineClassifier(Estimator):
@@ -40,11 +44,10 @@ def __init__(self, strategy="mode", random_state=0, **kwargs):
     def fit(self, X, y=None):
         if y is None:
             raise ValueError("Cannot fit Baseline classifier if y is None")
-
-        if not isinstance(y, pd.Series):
-            y = pd.Series(y)
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
+        y = _convert_to_woodwork_structure(y)
+        y = _convert_woodwork_types_wrapper(y.to_series())
 
         vals, counts = np.unique(y, return_counts=True)
         self._classes = list(vals)
@@ -57,6 +60,8 @@ def fit(self, X, y=None):
         return self
 
     def predict(self, X):
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         strategy = self.parameters["strategy"]
         if strategy == "mode":
             return pd.Series([self._mode] * len(X))
@@ -66,6 +71,8 @@ def predict(self, X):
             return self.random_state.choice(self._classes, len(X), p=self._percentage_freq)
 
     def predict_proba(self, X):
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         strategy = self.parameters["strategy"]
         if strategy == "mode":
             mode_index = self._classes.index(self._mode)

diff --git a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py
@@ -9,7 +9,10 @@
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
 from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise
-from evalml.utils.gen_utils import categorical_dtypes
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class CatBoostClassifier(Estimator):
@@ -56,11 +59,11 @@ def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None,
                          random_state=random_state)
 
     def fit(self, X, y=None):
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
-        if not isinstance(y, pd.Series):
-            y = pd.Series(y)
-        cat_cols = X.select_dtypes(categorical_dtypes)
+        X = _convert_to_woodwork_structure(X)
+        cat_cols = list(X.select('category').columns)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
+        y = _convert_to_woodwork_structure(y)
+        y = _convert_woodwork_types_wrapper(y.to_series())
 
         # For binary classification, catboost expects numeric values, so encoding before.
         if y.nunique() <= 2:
@@ -70,6 +73,8 @@ def fit(self, X, y=None):
         return model
 
     def predict(self, X):
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         predictions = self._component_obj.predict(X)
         if predictions.ndim == 2 and predictions.shape[1] == 1:
             predictions = predictions.flatten()

diff --git a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py
@@ -10,7 +10,11 @@
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
 from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise
-from evalml.utils.gen_utils import categorical_dtypes
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper,
+    _rename_column_names_to_numeric
+)
 
 
 class LightGBMClassifier(Estimator):
@@ -69,42 +73,42 @@ def __init__(self, boosting_type="gbdt", learning_rate=0.1, n_estimators=100, ma
                          random_state=random_seed)
 
     def _encode_categories(self, X, fit=False):
-        X2 = pd.DataFrame(copy.copy(X))
-        # encode each categorical feature as an integer
-        X2.columns = np.arange(len(X2.columns))
-        # necessary to wipe out column names in case any names contain symbols ([, ], <) which LightGBM cannot properly handle
-        cat_cols = X2.select_dtypes(categorical_dtypes).columns
+        """Encodes each categorical feature using ordinal encoding."""
+        X_encoded = _convert_to_woodwork_structure(X)
+        X_encoded = _rename_column_names_to_numeric(X_encoded)
+        cat_cols = list(X_encoded.select('category').columns)
+        X_encoded = _convert_woodwork_types_wrapper(X_encoded.to_dataframe())
         if len(cat_cols) == 0:
-            return X2
+            return X_encoded
         if fit:
             self._ordinal_encoder = OrdinalEncoder()
-            encoder_output = self._ordinal_encoder.fit_transform(X2[cat_cols])
+            encoder_output = self._ordinal_encoder.fit_transform(X_encoded[cat_cols])
         else:
-            encoder_output = self._ordinal_encoder.transform(X2[cat_cols])
-        X2[cat_cols] = pd.DataFrame(encoder_output)
-        X2[cat_cols] = X2[cat_cols].astype('category')
-        return X2
+            encoder_output = self._ordinal_encoder.transform(X_encoded[cat_cols])
+        X_encoded[cat_cols] = pd.DataFrame(encoder_output)
+        X_encoded[cat_cols] = X_encoded[cat_cols].astype('category')
+        return X_encoded
 
     def _encode_labels(self, y):
-        y1 = pd.Series(y)
+        y_encoded = pd.Series(y)
         # change only if dtype isn't int
-        if not is_integer_dtype(y1):
+        if not is_integer_dtype(y_encoded):
             self._label_encoder = LabelEncoder()
-            y1 = pd.Series(self._label_encoder.fit_transform(y1), dtype='int64')
-        return y1
+            y_encoded = pd.Series(self._label_encoder.fit_transform(y_encoded), dtype='int64')
+        return y_encoded
 
     def fit(self, X, y=None):
-        X2 = self._encode_categories(X, fit=True)
-        y2 = self._encode_labels(y)
-        return super().fit(X2, y2)
+        X_encoded = self._encode_categories(X, fit=True)
+        y_encoded = self._encode_labels(y)
+        return super().fit(X_encoded, y_encoded)
 
     def predict(self, X):
-        X2 = self._encode_categories(X)
-        predictions = super().predict(X2)
+        X_encoded = self._encode_categories(X)
+        predictions = super().predict(X_encoded)
         if self._label_encoder:
             predictions = pd.Series(self._label_encoder.inverse_transform(predictions.astype(np.int64)))
         return predictions
 
     def predict_proba(self, X):
-        X2 = self._encode_categories(X)
-        return super().predict_proba(X2)
+        X_encoded = self._encode_categories(X)
+        return super().predict_proba(X_encoded)
diff --git a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py
@@ -1,4 +1,3 @@
-import pandas as pd
 from skopt.space import Integer, Real
 
 from evalml.model_family import ModelFamily
@@ -42,21 +41,16 @@ def __init__(self, eta=0.1, max_depth=6, min_child_weight=1, n_estimators=100, r
                          random_state=random_state)
 
     def fit(self, X, y=None):
-        # rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle
-        if isinstance(X, pd.DataFrame):
-            X = _rename_column_names_to_numeric(X)
+        X = _rename_column_names_to_numeric(X)
         return super().fit(X, y)
 
     def predict(self, X):
-        # rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle
-        if isinstance(X, pd.DataFrame):
-            X = _rename_column_names_to_numeric(X)
+        X = _rename_column_names_to_numeric(X)
         predictions = super().predict(X)
         return predictions
 
     def predict_proba(self, X):
-        if isinstance(X, pd.DataFrame):
-            X = _rename_column_names_to_numeric(X)
+        X = _rename_column_names_to_numeric(X)
         predictions = super().predict_proba(X)
         return predictions