From f54abd3fe2e0f95b2e92c387e5ca575d868eb006 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Wed, 18 Nov 2020 20:12:04 -0500 Subject: [PATCH] Update components to accept Woodwork inputs (#1423) --- docs/source/release_notes.rst | 7 +- evalml/pipelines/classification_pipeline.py | 8 -- evalml/pipelines/components/component_base.py | 21 +++- .../ensemble/stacked_ensemble_base.py | 37 +----- .../ensemble/stacked_ensemble_classifier.py | 15 --- .../classifiers/baseline_classifier.py | 17 ++- .../classifiers/catboost_classifier.py | 17 ++- .../classifiers/lightgbm_classifier.py | 50 ++++---- .../classifiers/xgboost_classifier.py | 12 +- .../components/estimators/estimator.py | 12 +- .../regressors/baseline_regressor.py | 13 +- .../regressors/catboost_regressor.py | 18 +-- .../regressors/xgboost_regressor.py | 9 +- .../transformers/column_selectors.py | 11 +- .../dimensionality_reduction/pca.py | 18 +-- .../transformers/encoders/onehot_encoder.py | 57 +++++---- .../transformers/imputers/imputer.py | 23 ++-- .../imputers/per_column_imputer.py | 28 +++-- .../transformers/imputers/simple_imputer.py | 24 ++-- .../preprocessing/datetime_featurizer.py | 17 +-- .../preprocessing/drop_null_columns.py | 19 +-- .../transformers/preprocessing/lsa.py | 16 ++- .../preprocessing/text_featurizer.py | 12 +- evalml/pipelines/pipeline_base.py | 13 +- evalml/pipelines/regression_pipeline.py | 6 - .../test_baseline_classifier.py | 11 +- .../test_datetime_featurizer.py | 2 +- evalml/tests/component_tests/test_imputer.py | 21 ++-- evalml/tests/component_tests/test_lsa.py | 14 --- .../component_tests/test_one_hot_encoder.py | 26 +++- evalml/tests/component_tests/test_pca.py | 10 +- .../component_tests/test_simple_imputer.py | 28 +++-- evalml/tests/utils_tests/test_gen_utils.py | 118 ++++++++++++++++++ evalml/utils/__init__.py | 1 + evalml/utils/gen_utils.py | 59 ++++++--- 35 files changed, 465 insertions(+), 305 deletions(-) diff --git a/docs/source/release_notes.rst 
b/docs/source/release_notes.rst index 881daf72b0..f8a4e33cc3 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -3,9 +3,10 @@ Release Notes **Future Releases** * Enhancements - * Added ability to freeze hyperparameters for AutoMLSearch :pr:`1284` - * Added `Target Encoder` into transformer components :pr:`1401` - * Updated pipelines and ``make_pipeline`` to accept Woodwork DataTables :pr:`1393` + * Updated pipelines and ``make_pipeline`` to accept ``Woodwork`` inputs :pr:`1393` + * Updated components to accept ``Woodwork`` inputs :pr:`1423` + * Added ability to freeze hyperparameters for ``AutoMLSearch`` :pr:`1284` + * Added ``Target Encoder`` into transformer components :pr:`1401` * Added callback for error handling in ``AutoMLSearch`` :pr:`1403` * Added the index id to the ``explain_predictions_best_worst`` output to help users identify which rows in their data are included :pr:`1365` * The top_k features displayed in ``explain_predictions_*`` functions are now determined by the magnitude of shap values as opposed to the ``top_k`` largest and smallest shap values. 
:pr:`1374` diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py index c03ffca278..e0b759efc6 100644 --- a/evalml/pipelines/classification_pipeline.py +++ b/evalml/pipelines/classification_pipeline.py @@ -41,7 +41,6 @@ def fit(self, X, y): """ X = _convert_to_woodwork_structure(X) y = _convert_to_woodwork_structure(y) - X = _convert_woodwork_types_wrapper(X.to_dataframe()) y = _convert_woodwork_types_wrapper(y.to_series()) self._encoder.fit(y) y = self._encode_targets(y) @@ -92,8 +91,6 @@ def predict(self, X, objective=None): Returns: pd.Series : Estimated labels """ - X = _convert_to_woodwork_structure(X) - X = _convert_woodwork_types_wrapper(X.to_dataframe()) predictions = self._predict(X, objective) return pd.Series(self._decode_targets(predictions)) @@ -106,8 +103,6 @@ def predict_proba(self, X): Returns: pd.DataFrame: Probability estimates """ - X = _convert_to_woodwork_structure(X) - X = _convert_woodwork_types_wrapper(X.to_dataframe()) X = self.compute_estimator_features(X, y=None) proba = self.estimator.predict_proba(X) proba.columns = self._encoder.classes_ @@ -124,11 +119,8 @@ def score(self, X, y, objectives): Returns: dict: Ordered dictionary of objective scores """ - X = _convert_to_woodwork_structure(X) y = _convert_to_woodwork_structure(y) - X = _convert_woodwork_types_wrapper(X.to_dataframe()) y = _convert_woodwork_types_wrapper(y.to_series()) - objectives = [get_objective(o, return_instance=True) for o in objectives] y = self._encode_targets(y) y_predicted, y_predicted_proba = self._compute_predictions(X, objectives) diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index d15e17af7e..df9bd48c91 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -13,6 +13,10 @@ log_subtitle, safe_repr ) +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + 
_convert_woodwork_types_wrapper +) logger = get_logger(__file__) @@ -71,7 +75,7 @@ def clone(self, random_state=0): """Constructs a new component with the same parameters Arguments: - random_state (int): the value to seed the random state with. Can also be a RandomState instance. Defaults to 0. + random_state (int, RandomState): the value to seed the random state with. Can also be a RandomState instance. Defaults to 0. Returns: A new instance of this component with identical parameters @@ -82,12 +86,17 @@ def fit(self, X, y=None): """Fits component to data Arguments: - X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] - y (pd.Series, optional): the target training data of length [n_samples] + X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] + y (ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples] Returns: self """ + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) + if y is not None: + y = _convert_to_woodwork_structure(y) + y = _convert_woodwork_types_wrapper(y.to_series()) try: self._component_obj.fit(X, y) return self @@ -119,8 +128,8 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): """Saves component at file path Arguments: - file_path (str): location to save file - pickle_protocol (int): the pickle data stream format. + file_path (str): Location to save file + pickle_protocol (int): The pickle data stream format. 
Returns: None @@ -133,7 +142,7 @@ def load(file_path): """Loads component at file path Arguments: - file_path (str): location to load file + file_path (str): Location to load file Returns: ComponentBase object diff --git a/evalml/pipelines/components/ensemble/stacked_ensemble_base.py b/evalml/pipelines/components/ensemble/stacked_ensemble_base.py index 852381d96d..025bc52daf 100644 --- a/evalml/pipelines/components/ensemble/stacked_ensemble_base.py +++ b/evalml/pipelines/components/ensemble/stacked_ensemble_base.py @@ -1,5 +1,3 @@ -import pandas as pd - from evalml.exceptions import EnsembleMissingPipelinesError from evalml.model_family import ModelFamily from evalml.pipelines.components import Estimator @@ -78,33 +76,8 @@ def default_parameters(cls): Returns: dict: default parameters for this component. """ - return {'final_estimator': None, - 'cv': None, - 'n_jobs': 1, - } - - def fit(self, X, y=None): - """Fits component to data - - Arguments: - X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] - y (pd.Series, optional): the target training data of length [n_samples] - - Returns: - self - """ - self._component_obj.fit(X, y) - return self - - def predict(self, X): - """Make predictions using selected features. 
- - Arguments: - X (pd.DataFrame): Features - - Returns: - pd.Series: Predicted values - """ - predictions = self._component_obj.predict(X) - predictions = pd.Series(predictions) - return predictions + return { + 'final_estimator': None, + 'cv': None, + 'n_jobs': 1, + } diff --git a/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py b/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py index f8cb9b6b09..96f88e2672 100644 --- a/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py +++ b/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py @@ -1,4 +1,3 @@ -import pandas as pd from sklearn.ensemble import StackingClassifier from sklearn.model_selection import StratifiedKFold @@ -41,17 +40,3 @@ def __init__(self, input_pipelines=None, final_estimator=None, """ super().__init__(input_pipelines=input_pipelines, final_estimator=final_estimator, cv=cv, n_jobs=n_jobs, random_state=random_state, **kwargs) - - def predict_proba(self, X): - """Make probability estimates for labels. 
- - Arguments: - X (pd.DataFrame): Features - - Returns: - pd.DataFrame: Probability estimates - """ - pred_proba = self._component_obj.predict_proba(X) - if not isinstance(pred_proba, pd.DataFrame): - pred_proba = pd.DataFrame(pred_proba) - return pred_proba diff --git a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py index 6c7f5f2f22..81dbfb897e 100644 --- a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py @@ -5,6 +5,10 @@ from evalml.model_family import ModelFamily from evalml.pipelines.components.estimators import Estimator from evalml.problem_types import ProblemTypes +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class BaselineClassifier(Estimator): @@ -40,11 +44,10 @@ def __init__(self, strategy="mode", random_state=0, **kwargs): def fit(self, X, y=None): if y is None: raise ValueError("Cannot fit Baseline classifier if y is None") - - if not isinstance(y, pd.Series): - y = pd.Series(y) - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) + y = _convert_to_woodwork_structure(y) + y = _convert_woodwork_types_wrapper(y.to_series()) vals, counts = np.unique(y, return_counts=True) self._classes = list(vals) @@ -57,6 +60,8 @@ def fit(self, X, y=None): return self def predict(self, X): + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) strategy = self.parameters["strategy"] if strategy == "mode": return pd.Series([self._mode] * len(X)) @@ -66,6 +71,8 @@ def predict(self, X): return self.random_state.choice(self._classes, len(X), p=self._percentage_freq) def predict_proba(self, X): + X = _convert_to_woodwork_structure(X) + X = 
_convert_woodwork_types_wrapper(X.to_dataframe()) strategy = self.parameters["strategy"] if strategy == "mode": mode_index = self._classes.index(self._mode) diff --git a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py index b2fddb00b6..691507a998 100644 --- a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py @@ -9,7 +9,10 @@ from evalml.pipelines.components.estimators import Estimator from evalml.problem_types import ProblemTypes from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise -from evalml.utils.gen_utils import categorical_dtypes +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class CatBoostClassifier(Estimator): @@ -56,11 +59,11 @@ def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None, random_state=random_state) def fit(self, X, y=None): - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) - if not isinstance(y, pd.Series): - y = pd.Series(y) - cat_cols = X.select_dtypes(categorical_dtypes) + X = _convert_to_woodwork_structure(X) + cat_cols = list(X.select('category').columns) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) + y = _convert_to_woodwork_structure(y) + y = _convert_woodwork_types_wrapper(y.to_series()) # For binary classification, catboost expects numeric values, so encoding before. 
if y.nunique() <= 2: @@ -70,6 +73,8 @@ def fit(self, X, y=None): return model def predict(self, X): + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) predictions = self._component_obj.predict(X) if predictions.ndim == 2 and predictions.shape[1] == 1: predictions = predictions.flatten() diff --git a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py index 7066f58aa1..4700fe48e1 100644 --- a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py @@ -10,7 +10,11 @@ from evalml.pipelines.components.estimators import Estimator from evalml.problem_types import ProblemTypes from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise -from evalml.utils.gen_utils import categorical_dtypes +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper, + _rename_column_names_to_numeric +) class LightGBMClassifier(Estimator): @@ -69,42 +73,42 @@ def __init__(self, boosting_type="gbdt", learning_rate=0.1, n_estimators=100, ma random_state=random_seed) def _encode_categories(self, X, fit=False): - X2 = pd.DataFrame(copy.copy(X)) - # encode each categorical feature as an integer - X2.columns = np.arange(len(X2.columns)) - # necessary to wipe out column names in case any names contain symbols ([, ], <) which LightGBM cannot properly handle - cat_cols = X2.select_dtypes(categorical_dtypes).columns + """Encodes each categorical feature using ordinal encoding.""" + X_encoded = _convert_to_woodwork_structure(X) + X_encoded = _rename_column_names_to_numeric(X_encoded) + cat_cols = list(X_encoded.select('category').columns) + X_encoded = _convert_woodwork_types_wrapper(X_encoded.to_dataframe()) if len(cat_cols) == 0: - return X2 + return X_encoded if fit: self._ordinal_encoder = 
OrdinalEncoder() - encoder_output = self._ordinal_encoder.fit_transform(X2[cat_cols]) + encoder_output = self._ordinal_encoder.fit_transform(X_encoded[cat_cols]) else: - encoder_output = self._ordinal_encoder.transform(X2[cat_cols]) - X2[cat_cols] = pd.DataFrame(encoder_output) - X2[cat_cols] = X2[cat_cols].astype('category') - return X2 + encoder_output = self._ordinal_encoder.transform(X_encoded[cat_cols]) + X_encoded[cat_cols] = pd.DataFrame(encoder_output) + X_encoded[cat_cols] = X_encoded[cat_cols].astype('category') + return X_encoded def _encode_labels(self, y): - y1 = pd.Series(y) + y_encoded = pd.Series(y) # change only if dtype isn't int - if not is_integer_dtype(y1): + if not is_integer_dtype(y_encoded): self._label_encoder = LabelEncoder() - y1 = pd.Series(self._label_encoder.fit_transform(y1), dtype='int64') - return y1 + y_encoded = pd.Series(self._label_encoder.fit_transform(y_encoded), dtype='int64') + return y_encoded def fit(self, X, y=None): - X2 = self._encode_categories(X, fit=True) - y2 = self._encode_labels(y) - return super().fit(X2, y2) + X_encoded = self._encode_categories(X, fit=True) + y_encoded = self._encode_labels(y) + return super().fit(X_encoded, y_encoded) def predict(self, X): - X2 = self._encode_categories(X) - predictions = super().predict(X2) + X_encoded = self._encode_categories(X) + predictions = super().predict(X_encoded) if self._label_encoder: predictions = pd.Series(self._label_encoder.inverse_transform(predictions.astype(np.int64))) return predictions def predict_proba(self, X): - X2 = self._encode_categories(X) - return super().predict_proba(X2) + X_encoded = self._encode_categories(X) + return super().predict_proba(X_encoded) diff --git a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py index 165bd0d7b8..b937fc8e9f 100644 --- a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py +++ 
b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py @@ -1,4 +1,3 @@ -import pandas as pd from skopt.space import Integer, Real from evalml.model_family import ModelFamily @@ -42,21 +41,16 @@ def __init__(self, eta=0.1, max_depth=6, min_child_weight=1, n_estimators=100, r random_state=random_state) def fit(self, X, y=None): - # rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle - if isinstance(X, pd.DataFrame): - X = _rename_column_names_to_numeric(X) + X = _rename_column_names_to_numeric(X) return super().fit(X, y) def predict(self, X): - # rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle - if isinstance(X, pd.DataFrame): - X = _rename_column_names_to_numeric(X) + X = _rename_column_names_to_numeric(X) predictions = super().predict(X) return predictions def predict_proba(self, X): - if isinstance(X, pd.DataFrame): - X = _rename_column_names_to_numeric(X) + X = _rename_column_names_to_numeric(X) predictions = super().predict_proba(X) return predictions diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py index 252ae2a605..1ad9e98455 100644 --- a/evalml/pipelines/components/estimators/estimator.py +++ b/evalml/pipelines/components/estimators/estimator.py @@ -4,6 +4,10 @@ from evalml.exceptions import MethodPropertyNotFoundError from evalml.pipelines.components import ComponentBase +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class Estimator(ComponentBase): @@ -28,12 +32,14 @@ def predict(self, X): """Make predictions using selected features. 
Arguments: - X (pd.DataFrame): Features + X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features] Returns: pd.Series: Predicted values """ try: + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) predictions = self._component_obj.predict(X) except AttributeError: raise MethodPropertyNotFoundError("Estimator requires a predict method or a component_obj that implements predict") @@ -45,12 +51,14 @@ def predict_proba(self, X): """Make probability estimates for labels. Arguments: - X (pd.DataFrame): Features + X (ww.DataTable, pd.DataFrame, or np.ndarray): Features Returns: pd.DataFrame: Probability estimates """ try: + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) pred_proba = self._component_obj.predict_proba(X) except AttributeError: raise MethodPropertyNotFoundError("Estimator requires a predict_proba method or a component_obj that implements predict_proba") diff --git a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py index ee31b20526..2b46addd99 100644 --- a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py @@ -4,6 +4,10 @@ from evalml.model_family import ModelFamily from evalml.pipelines.components.estimators import Estimator from evalml.problem_types import ProblemTypes +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class BaselineRegressor(Estimator): @@ -38,9 +42,10 @@ def __init__(self, strategy="mean", random_state=0, **kwargs): def fit(self, X, y=None): if y is None: raise ValueError("Cannot fit Baseline regressor if y is None") - - if not isinstance(y, pd.Series): - y = pd.Series(y) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) + y = 
_convert_to_woodwork_structure(y) + y = _convert_woodwork_types_wrapper(y.to_series()) if self.parameters["strategy"] == "mean": self._prediction_value = y.mean() @@ -50,6 +55,8 @@ def fit(self, X, y=None): return self def predict(self, X): + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) return pd.Series([self._prediction_value] * len(X)) @property diff --git a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py index 042eb32979..505cff948a 100644 --- a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py @@ -1,13 +1,15 @@ import copy -import pandas as pd from skopt.space import Integer, Real from evalml.model_family import ModelFamily from evalml.pipelines.components.estimators import Estimator from evalml.problem_types import ProblemTypes from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise -from evalml.utils.gen_utils import categorical_dtypes +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class CatBoostRegressor(Estimator): @@ -53,11 +55,13 @@ def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None, random_state=random_state) def fit(self, X, y=None): - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) - if not isinstance(y, pd.Series): - y = pd.Series(y) - cat_cols = X.select_dtypes(categorical_dtypes) + X = _convert_to_woodwork_structure(X) + cat_cols = list(X.select('category').columns) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) + + y = _convert_to_woodwork_structure(y) + y = _convert_woodwork_types_wrapper(y.to_series()) + model = self._component_obj.fit(X, y, silent=True, cat_features=cat_cols) return model diff --git a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py 
b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py index b620b68715..6d2e5b94b7 100644 --- a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py @@ -1,4 +1,3 @@ -import pandas as pd from skopt.space import Integer, Real from evalml.model_family import ModelFamily @@ -42,15 +41,11 @@ def __init__(self, eta=0.1, max_depth=6, min_child_weight=1, n_estimators=100, r random_state=random_state) def fit(self, X, y=None): - # rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle - if isinstance(X, pd.DataFrame): - X = _rename_column_names_to_numeric(X) + X = _rename_column_names_to_numeric(X) return super().fit(X, y) def predict(self, X): - # rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle - if isinstance(X, pd.DataFrame): - X = _rename_column_names_to_numeric(X) + X = _rename_column_names_to_numeric(X) predictions = super().predict(X) return predictions diff --git a/evalml/pipelines/components/transformers/column_selectors.py b/evalml/pipelines/components/transformers/column_selectors.py index b71cef566d..1e176c8f1d 100644 --- a/evalml/pipelines/components/transformers/column_selectors.py +++ b/evalml/pipelines/components/transformers/column_selectors.py @@ -1,9 +1,12 @@ from abc import abstractmethod import numpy as np -import pandas as pd from evalml.pipelines.components.transformers import Transformer +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class ColumnSelector(Transformer): @@ -56,10 +59,8 @@ def fit(self, X, y=None): return self def transform(self, X, y=None): - - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) - + X = _convert_to_woodwork_structure(X) + X = 
_convert_woodwork_types_wrapper(X.to_dataframe()) self._check_input_for_columns(X) cols = self.parameters.get("columns") or [] diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py index e4a8974eec..b5e935c8d9 100644 --- a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py +++ b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py @@ -3,7 +3,11 @@ from skopt.space import Real from evalml.pipelines.components.transformers import Transformer -from evalml.utils.gen_utils import is_all_numeric +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper, + is_all_numeric +) class PCA(Transformer): @@ -33,8 +37,8 @@ def __init__(self, variance=0.95, n_components=None, random_state=0, **kwargs): random_state=random_state) def fit(self, X, y=None): - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) if not is_all_numeric(X): raise ValueError("PCA input must be all numeric") @@ -42,8 +46,8 @@ def fit(self, X, y=None): return self def transform(self, X, y=None): - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) if not is_all_numeric(X): raise ValueError("PCA input must be all numeric") @@ -51,8 +55,8 @@ def transform(self, X, y=None): return pd.DataFrame(X_t, index=X.index, columns=[f"component_{i}" for i in range(X_t.shape[1])]) def fit_transform(self, X, y=None): - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) if not is_all_numeric(X): raise ValueError("PCA input must be all numeric") diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py 
b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index d80c266201..a095334439 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -3,9 +3,12 @@ import pandas as pd from sklearn.preprocessing import OneHotEncoder as SKOneHotEncoder -from ..transformer import Transformer - from evalml.pipelines.components import ComponentBaseMeta +from evalml.pipelines.components.transformers.transformer import Transformer +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class OneHotEncoderMeta(ComponentBaseMeta): @@ -82,20 +85,16 @@ def _get_cat_cols(X): def fit(self, X, y=None): top_n = self.parameters['top_n'] - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) X_t = X - if self.features_to_encode is None: self.features_to_encode = self._get_cat_cols(X_t) invalid_features = [col for col in self.features_to_encode if col not in list(X.columns)] if len(invalid_features) > 0: raise ValueError("Could not find and encode {} in input data.".format(', '.join(invalid_features))) - if self.parameters['handle_missing'] == "as_category": - X_t[self.features_to_encode] = X_t[self.features_to_encode].replace(np.nan, "nan") - elif self.parameters['handle_missing'] == "error" and X.isnull().any().any(): - raise ValueError("Input contains NaN") + X_t = self._handle_parameter_handle_missing(X_t) if len(self.features_to_encode) == 0: categories = 'auto' @@ -137,35 +136,41 @@ def transform(self, X, y=None): Returns: Transformed dataframe, where each categorical feature has been encoded into numerical columns using one-hot encoding. 
""" - - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) - - cat_cols = self.features_to_encode - - if self.parameters['handle_missing'] == "as_category": - X[cat_cols] = X[cat_cols].replace(np.nan, "nan") - if self.parameters['handle_missing'] == "error" and X.isnull().any().any(): - raise ValueError("Input contains NaN") + X_copy = _convert_to_woodwork_structure(X) + X_copy = _convert_woodwork_types_wrapper(X_copy.to_dataframe()) + X_copy = self._handle_parameter_handle_missing(X_copy) X_t = pd.DataFrame() # Add the non-categorical columns, untouched - for col in X.columns: - if col not in cat_cols: - X_t = pd.concat([X_t, X[col]], axis=1) + for col in X_copy.columns: + if col not in self.features_to_encode: + X_t = pd.concat([X_t, X_copy[col]], axis=1) # The call to pd.concat above changes the type of the index so we will manually keep it the same. if not X_t.empty: - X_t.index = X.index + X_t.index = X_copy.index # Call sklearn's transform on the categorical columns - if len(cat_cols) > 0: - X_cat = pd.DataFrame(self._encoder.transform(X[cat_cols]).toarray(), index=X.index) - cat_cols_str = [str(c) for c in cat_cols] + if len(self.features_to_encode) > 0: + X_cat = pd.DataFrame(self._encoder.transform(X_copy[self.features_to_encode]).toarray(), index=X_copy.index) + cat_cols_str = [str(c) for c in self.features_to_encode] X_cat.columns = self._encoder.get_feature_names(input_features=cat_cols_str) X_t = pd.concat([X_t, X_cat], axis=1) return X_t + def _handle_parameter_handle_missing(self, X): + """Helper method to handle the `handle_missing` parameter.""" + cat_cols = self.features_to_encode + if self.parameters['handle_missing'] == "error" and X.isnull().any().any(): + raise ValueError("Input contains NaN") + if self.parameters['handle_missing'] == "as_category": + for col in cat_cols: + if X[col].dtype == 'category' and pd.isna(X[col]).any(): + X[col] = X[col].cat.add_categories("nan") + X[col] = X[col].where(~pd.isna(X[col]), other='nan') + 
X[cat_cols] = X[cat_cols].replace(np.nan, "nan") + return X + def categories(self, feature_name): """Returns a list of the unique categories to be encoded for the particular feature, in order. diff --git a/evalml/pipelines/components/transformers/imputers/imputer.py b/evalml/pipelines/components/transformers/imputers/imputer.py index 9dd0df8285..82c9dbd0aa 100644 --- a/evalml/pipelines/components/transformers/imputers/imputer.py +++ b/evalml/pipelines/components/transformers/imputers/imputer.py @@ -1,8 +1,9 @@ -import pandas as pd - from evalml.pipelines.components.transformers import Transformer from evalml.pipelines.components.transformers.imputers import SimpleImputer -from evalml.utils.gen_utils import boolean, categorical_dtypes, numeric_dtypes +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class Imputer(Transformer): @@ -62,19 +63,22 @@ def fit(self, X, y=None): Returns: self """ - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + cat_cols = list(X.select('category').columns) + numeric_cols = list(X.select('numeric').columns) + + X = _convert_woodwork_types_wrapper(X.to_dataframe()) self._all_null_cols = set(X.columns) - set(X.dropna(axis=1, how='all').columns) X_copy = X.copy() X_null_dropped = X_copy.drop(self._all_null_cols, axis=1, errors='ignore') - X_numerics = X_null_dropped.select_dtypes(include=numeric_dtypes) + X_numerics = X_null_dropped[[col for col in numeric_cols if col not in self._all_null_cols]] if len(X_numerics.columns) > 0: self._numeric_imputer.fit(X_numerics, y) self._numeric_cols = X_numerics.columns - X_categorical = X_null_dropped.select_dtypes(include=categorical_dtypes + boolean) + X_categorical = X_null_dropped[[col for col in cat_cols if col not in self._all_null_cols]] if len(X_categorical.columns) > 0: self._categorical_imputer.fit(X_categorical, y) self._categorical_cols = X_categorical.columns @@ -91,8 +95,9 @@ def 
transform(self, X, y=None): Returns: pd.DataFrame: Transformed X """ - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) + X_null_dropped = X.copy() X_null_dropped.drop(self._all_null_cols, inplace=True, axis=1, errors='ignore') X_null_dropped.reset_index(inplace=True, drop=True) diff --git a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py index af4ffccb74..8ddf18bc37 100644 --- a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py @@ -1,9 +1,11 @@ -import pandas as pd - from evalml.pipelines.components.transformers import Transformer from evalml.pipelines.components.transformers.imputers.simple_imputer import ( SimpleImputer ) +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class PerColumnImputer(Transformer): @@ -43,14 +45,14 @@ def fit(self, X, y=None): """Fits imputers on input data Arguments: - X (pd.DataFrame): Data to fit - y (pd.Series, optional): Ignored. + X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit. + y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples]. Ignored. Returns: self """ - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) self.imputers = dict() for column in X.columns: strategy_dict = self.impute_strategies.get(column, dict()) @@ -64,17 +66,17 @@ def fit(self, X, y=None): return self def transform(self, X, y=None): - """Transforms input data by imputing missing values + """Transforms input data by imputing missing values. 
Arguments: - X (pd.DataFrame): Data to transform - y (pd.Series, optional): Ignored. + X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to transform. + y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples]. Ignored. Returns: pd.DataFrame: Transformed X """ - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) X_t = X.copy() cols_to_drop = [] for column, imputer in self.imputers.items(): @@ -90,8 +92,8 @@ def fit_transform(self, X, y=None): """Fits imputer and imputes missing values in input data. Arguments: - X (pd.DataFrame): Data to fit and transform - y (pd.Series): Target data. + X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to transform. + y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples]. Ignored. Returns: pd.DataFrame: Transformed X diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py index 6b7d42403a..55cc268671 100644 --- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py @@ -3,6 +3,10 @@ from sklearn.impute import SimpleImputer as SkImputer from evalml.pipelines.components.transformers import Transformer +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class SimpleImputer(Transformer): @@ -35,14 +39,14 @@ def fit(self, X, y=None): treated as the same. 
Arguments: - X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] - y (pd.Series, optional): the target training data of length [n_samples] + X (ww.DataTable, pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] + y (ww.DataColumn, pd.Series, optional): the target training data of length [n_samples] Returns: self """ - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) # Convert None to np.nan, since None cannot be properly handled X = X.fillna(value=np.nan) @@ -55,14 +59,14 @@ def transform(self, X, y=None): treated as the same. Arguments: - X (pd.DataFrame): Data to transform - y (pd.Series, optional): Ignored. + X (ww.DataTable, pd.DataFrame): Data to transform + y (ww.DataColumn, pd.Series, optional): Ignored. Returns: pd.DataFrame: Transformed X """ - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) # Convert None to np.nan, since None cannot be properly handled X = X.fillna(value=np.nan) @@ -81,8 +85,8 @@ def fit_transform(self, X, y=None): """Fits on X and transforms X Arguments: - X (pd.DataFrame): Data to fit and transform - y (pd. DataFrame): Target data. + X (ww.DataTable, pd.DataFrame): Data to fit and transform + y (ww.DataColumn, pd.Series, optional): Target data. 
Returns: pd.DataFrame: Transformed X diff --git a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py index 39f3114f62..4ccb6c42ad 100644 --- a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py @@ -1,7 +1,9 @@ -import pandas as pd - from evalml.pipelines.components.transformers import Transformer -from evalml.utils.gen_utils import datetime_dtypes +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper, + datetime_dtypes +) def _extract_year(col): @@ -51,8 +53,8 @@ def __init__(self, features_to_extract=None, random_state=0, **kwargs): random_state=random_state) def fit(self, X, y=None): - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) self._date_time_col_names = X.select_dtypes(include=datetime_dtypes).columns return self @@ -66,10 +68,9 @@ def transform(self, X, y=None): Returns: pd.DataFrame: Transformed X """ - + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) X_t = X - if not isinstance(X_t, pd.DataFrame): - X_t = pd.DataFrame(X_t) features_to_extract = self.parameters["features_to_extract"] if len(features_to_extract) == 0: return X_t diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py index b9cbe33e6b..9ba7981b5c 100644 --- a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py +++ b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py @@ -1,6 +1,8 @@ -import pandas as pd - from evalml.pipelines.components.transformers import Transformer +from evalml.utils.gen_utils import ( + 
_convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class DropNullColumns(Transformer): @@ -28,9 +30,9 @@ def __init__(self, pct_null_threshold=1.0, random_state=0, **kwargs): def fit(self, X, y=None): pct_null_threshold = self.parameters["pct_null_threshold"] - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) - percent_null = X.isnull().mean() + X_t = _convert_to_woodwork_structure(X) + X_t = _convert_woodwork_types_wrapper(X_t.to_dataframe()) + percent_null = X_t.isnull().mean() if pct_null_threshold == 0.0: null_cols = percent_null[percent_null > 0] else: @@ -48,7 +50,6 @@ def transform(self, X, y=None): Returns: pd.DataFrame: Transformed X """ - - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) - return X.drop(columns=self._cols_to_drop, axis=1) + X_t = _convert_to_woodwork_structure(X) + X_t = _convert_woodwork_types_wrapper(X_t.to_dataframe()) + return X_t.drop(columns=self._cols_to_drop, axis=1) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 9871f147d9..2cce495e8d 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -6,6 +6,10 @@ from evalml.pipelines.components.transformers.preprocessing import ( TextTransformer ) +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class LSA(TextTransformer): @@ -28,8 +32,8 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): def fit(self, X, y=None): if len(self._all_text_columns) == 0: return self - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) text_columns = self._get_text_columns(X) corpus = X[text_columns].values.flatten() # we assume non-str values will have been filtered out prior to calling LSA.fit. 
this is a safeguard. @@ -41,15 +45,15 @@ def transform(self, X, y=None): """Transforms data X by applying the LSA pipeline. Arguments: - X (pd.DataFrame): Data to transform - y (pd.Series, optional): Ignored. + X (ww.DataTable, pd.DataFrame): Data to transform + y (ww.DataColumn, pd.Series, optional): Ignored. Returns: pd.DataFrame: Transformed X. The original column is removed and replaced with two columns of the format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1. """ - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) if len(self._all_text_columns) == 0: return X diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index ff74c2f4f7..a58385018e 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -8,6 +8,10 @@ LSA, TextTransformer ) +from evalml.utils.gen_utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper +) class TextFeaturizer(TextTransformer): @@ -70,8 +74,8 @@ def fit(self, X, y=None): """ if len(self._all_text_columns) == 0: return self - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) text_columns = self._get_text_columns(X) es = self._make_entity_set(X, text_columns) @@ -92,8 +96,8 @@ def transform(self, X, y=None): Returns: pd.DataFrame: Transformed X """ - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) if self._features is None or len(self._features) == 0: return X diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 
1dbc3caa63..c0a6ee1312 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -20,6 +20,7 @@ ) from evalml.pipelines.pipeline_base_meta import PipelineBaseMeta from evalml.utils import ( + _convert_to_woodwork_structure, check_random_state_equality, classproperty, get_logger, @@ -30,10 +31,6 @@ log_title, safe_repr ) -from evalml.utils.gen_utils import ( - _convert_to_woodwork_structure, - _convert_woodwork_types_wrapper -) logger = get_logger(__file__) @@ -197,16 +194,13 @@ def compute_estimator_features(self, X, y=None): def _compute_features_during_fit(self, X, y): X_t = X for component in self.component_graph[:-1]: - self.input_feature_names.update({component.name: list(pd.DataFrame(X_t))}) + self.input_feature_names.update({component.name: list(X_t.columns)}) X_t = component.fit_transform(X_t, y=y) - - self.input_feature_names.update({self.estimator.name: list(pd.DataFrame(X_t))}) - + self.input_feature_names.update({self.estimator.name: list(X_t.columns)}) return X_t def _fit(self, X, y): X_t = self._compute_features_during_fit(X, y) - self.estimator.fit(X_t, y) @abstractmethod @@ -233,7 +227,6 @@ def predict(self, X, objective=None): pd.Series: Predicted values. 
""" X = _convert_to_woodwork_structure(X) - X = _convert_woodwork_types_wrapper(X.to_dataframe()) X_t = self.compute_estimator_features(X, y=None) return self.estimator.predict(X_t) diff --git a/evalml/pipelines/regression_pipeline.py b/evalml/pipelines/regression_pipeline.py index 66b14e3640..7c28087bfe 100644 --- a/evalml/pipelines/regression_pipeline.py +++ b/evalml/pipelines/regression_pipeline.py @@ -26,7 +26,6 @@ def fit(self, X, y): """ X = _convert_to_woodwork_structure(X) y = _convert_to_woodwork_structure(y) - X = _convert_woodwork_types_wrapper(X.to_dataframe()) y = _convert_woodwork_types_wrapper(y.to_series()) if y.dtype not in numeric_dtypes: raise ValueError(f"Regression pipeline cannot handle targets with dtype: {y.dtype}") @@ -44,11 +43,6 @@ def score(self, X, y, objectives): Returns: dict: Ordered dictionary of objective scores """ - X = _convert_to_woodwork_structure(X) - y = _convert_to_woodwork_structure(y) - X = _convert_woodwork_types_wrapper(X.to_dataframe()) - y = _convert_woodwork_types_wrapper(y.to_series()) - objectives = [get_objective(o, return_instance=True) for o in objectives] y_predicted = self.predict(X) return self._score_all_objectives(X, y, y_predicted, y_pred_proba=None, objectives=objectives) diff --git a/evalml/tests/component_tests/test_baseline_classifier.py b/evalml/tests/component_tests/test_baseline_classifier.py index 081996de1c..3333e49fed 100644 --- a/evalml/tests/component_tests/test_baseline_classifier.py +++ b/evalml/tests/component_tests/test_baseline_classifier.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest +import woodwork as ww from evalml.model_family import ModelFamily from evalml.pipelines.components import BaselineClassifier @@ -25,15 +26,19 @@ def test_baseline_y_is_None(X_y_binary): BaselineClassifier().fit(X, y=None) -def test_baseline_binary_mode(X_y_binary): +@pytest.mark.parametrize('data_type', ['pd', 'ww']) +def test_baseline_binary_mode(data_type, X_y_binary): X = 
pd.DataFrame({'one': [1, 2, 3, 4], 'two': [2, 3, 4, 5], 'three': [1, 2, 3, 4]}) y = pd.Series([10, 11, 10, 10]) + if data_type == 'ww': + X = ww.DataTable(X) + y = ww.DataColumn(y) clf = BaselineClassifier(strategy="mode") clf.fit(X, y) assert clf.classes_ == [10, 11] - np.testing.assert_allclose(clf.predict(X), np.array([10] * len(X))) + np.testing.assert_allclose(clf.predict(X), np.array([10] * X.shape[0])) predicted_proba = clf.predict_proba(X) - assert predicted_proba.shape == (len(X), 2) + assert predicted_proba.shape == (X.shape[0], 2) expected_predicted_proba = pd.DataFrame({10: [1., 1., 1., 1.], 11: [0., 0., 0., 0.]}) pd.testing.assert_frame_equal(expected_predicted_proba, predicted_proba) np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1])) diff --git a/evalml/tests/component_tests/test_datetime_featurizer.py b/evalml/tests/component_tests/test_datetime_featurizer.py index 1e1b056982..a01c75927f 100644 --- a/evalml/tests/component_tests/test_datetime_featurizer.py +++ b/evalml/tests/component_tests/test_datetime_featurizer.py @@ -77,6 +77,6 @@ def test_datetime_featurizer_no_datetime_cols(): def test_datetime_featurizer_numpy_array_input(): datetime_transformer = DateTimeFeaturizer() - X = np.array(['2007-02-03', '2016-06-07', '2020-05-19'], dtype='datetime64') + X = np.array([['2007-02-03'], ['2016-06-07'], ['2020-05-19']], dtype='datetime64') datetime_transformer.fit(X) assert list(datetime_transformer.transform(X).columns) == ["0_year", "0_month", "0_day_of_week", "0_hour"] diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py index 2cf7cbe764..4401562a88 100644 --- a/evalml/tests/component_tests/test_imputer.py +++ b/evalml/tests/component_tests/test_imputer.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest +import woodwork as ww from pandas.testing import assert_frame_equal from evalml.pipelines.components import Imputer @@ -95,10 +96,10 @@ def 
test_categorical_only_input(imputer_test_data): transformed = imputer.transform(X, y) expected = pd.DataFrame({ "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'), - "object col": ["b", "b", "a", "c", "d"], + "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'), "bool col": [True, False, False, True, True], "categorical with nan": pd.Series(["0", "1", "0", "0", "3"], dtype='category'), - "object with nan": ["b", "b", "b", "c", "b"], + "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype='category'), "bool col with nan": [True, True, False, True, True] }) @@ -116,13 +117,13 @@ def test_categorical_and_numeric_input(imputer_test_data): expected = pd.DataFrame({ "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'), "int col": [0, 1, 2, 0, 3], - "object col": ["b", "b", "a", "c", "d"], + "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'), "float col": [0.0, 1.0, 0.0, -2.0, 5.], "bool col": [True, False, False, True, True], "categorical with nan": pd.Series(["0", "1", "0", "0", "3"], dtype='category'), "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0], "float with nan": [0.0, 1.0, 0, -1.0, 0.], - "object with nan": ["b", "b", "b", "c", "b"], + "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype='category'), "bool col with nan": [True, True, False, True, True] }) assert_frame_equal(transformed, expected, check_dtype=False) @@ -181,12 +182,16 @@ def test_imputer_datetime_input(): assert_frame_equal(transformed, X, check_dtype=False) -@pytest.mark.parametrize("data_type", ['np', 'pd']) +@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) def test_imputer_empty_data(data_type): if data_type == 'pd': X = pd.DataFrame() y = pd.Series() expected = pd.DataFrame(index=pd.Int64Index([]), columns=pd.Index([])) + elif data_type == 'ww': + X = ww.DataTable(pd.DataFrame()) + y = ww.DataColumn(pd.Series()) + expected = pd.DataFrame(index=pd.Int64Index([]), 
columns=pd.Index([])) else: X = np.array([[]]) y = np.array([]) @@ -234,7 +239,7 @@ def test_imputer_fill_value(imputer_test_data): "int with nan": [-1, 1, 0, 0, 1], "categorical with nan": pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'), "float with nan": [0.0, 1.0, -1, -1.0, 0.], - "object with nan": ["b", "b", "fill", "c", "fill"], + "object with nan": pd.Series(["b", "b", "fill", "c", "fill"], dtype='category'), "bool col with nan": [True, "fill", False, "fill", True] }) assert_frame_equal(transformed, expected, check_dtype=False) @@ -254,7 +259,7 @@ def test_imputer_no_nans(imputer_test_data): transformed = imputer.transform(X, y) expected = pd.DataFrame({ "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'), - "object col": ["b", "b", "a", "c", "d"], + "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'), "bool col": [True, False, False, True, True], }) assert_frame_equal(transformed, expected, check_dtype=False) @@ -280,7 +285,7 @@ def test_imputer_with_none(): "float with None": [0.1, 0.0, 0.5, 0.2], "category with None": pd.Series(["b", "a", "a", "a"], dtype='category'), "boolean with None": [True, True, False, True], - "object with None": ["b", "a", "a", "a"]}) + "object with None": pd.Series(["b", "a", "a", "a"], dtype='category')}) assert_frame_equal(transformed, expected, check_dtype=False) imputer = Imputer() diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index 5c86a16f0f..22435736d1 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -152,20 +152,6 @@ def test_int_col_names(): assert X_t.dtypes.all() == np.float64 -def test_repeat_col_names(): - X = pd.DataFrame(data=np.array([['identical string one', 'identical string one'], - ['second double string', 'second double string'], - ['copy the third', 'copy the third']]), columns=['col_1', 'col_1']) - lsa = LSA(text_columns=['col_1', 
'col_1']) - lsa.fit(X) - expected_col_names = ['LSA(col_1)[0]', - 'LSA(col_1)[1]'] - X_t = lsa.transform(X) - np.testing.assert_array_equal(X_t.columns, np.array(expected_col_names)) - assert len(X_t.columns) == 2 - assert X_t.dtypes.all() == np.float64 - - def test_lsa_output(): X = pd.DataFrame( {'lsa': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index cf9549eccd..22308e7d95 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -1,10 +1,15 @@ import numpy as np import pandas as pd import pytest +import woodwork as ww from evalml.exceptions import ComponentNotYetFittedError from evalml.pipelines.components import OneHotEncoder -from evalml.utils import get_random_state +from evalml.utils import ( + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper, + get_random_state +) def test_init(): @@ -147,7 +152,6 @@ def test_no_top_n(): "col_2": ["a", "c", "d", "b", "e", "e", "f", "a", "b", "c", "d"], "col_3": ["a", "a", "a", "a", "a", "a", "b", "a", "a", "b", "b"], "col_4": [2, 0, 1, 3, 0, 1, 2, 0, 2, 1, 2]}) - expected_col_names = set(["col_3_a", "col_3_b", "col_4"]) for val in X["col_1"]: expected_col_names.add("col_1_" + val) @@ -228,6 +232,10 @@ def test_more_top_n_unique_values(): encoder = OneHotEncoder(top_n=5, random_state=random_seed) encoder.fit(X) X_t = encoder.transform(X) + + # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need make the conversion here too + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed) col_1_counts = 
col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort') @@ -260,6 +268,10 @@ def test_more_top_n_unique_values_large(): encoder = OneHotEncoder(top_n=3, random_state=random_seed) encoder.fit(X) X_t = encoder.transform(X) + + # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need make the conversion here too + X = _convert_to_woodwork_structure(X) + X = _convert_woodwork_types_wrapper(X.to_dataframe()) col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() col_1_counts = col_1_counts.sample(frac=1, random_state=test_random_state) col_1_counts = col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort') @@ -310,7 +322,7 @@ def test_numpy_input(): encoder = OneHotEncoder() encoder.fit(X) X_t = encoder.transform(X) - assert pd.DataFrame(X).equals(X_t) + pd.testing.assert_frame_equal(pd.DataFrame(X), X_t, check_dtype=False) def test_large_number_of_categories(): @@ -329,16 +341,18 @@ def test_large_number_of_categories(): assert set(expected_col_names) == set(list(X_t.columns)) -@pytest.mark.parametrize('data_type', ['list', 'np', 'pd_no_index', 'pd_index']) +@pytest.mark.parametrize('data_type', ['list', 'np', 'pd_no_index', 'pd_index', 'ww']) def test_data_types(data_type): if data_type == 'list': - X = ["a", "b", "c"] + X = [["a"], ["b"], ["c"]] elif data_type == 'np': - X = np.array(["a", "b", "c"]) + X = np.array([["a"], ["b"], ["c"]]) elif data_type == 'pd_no_index': X = pd.DataFrame(["a", "b", "c"]) elif data_type == 'pd_index': X = pd.DataFrame(["a", "b", "c"], columns=['0']) + elif data_type == 'ww': + X = ww.DataTable(pd.DataFrame(["a", "b", "c"])) encoder = OneHotEncoder() encoder.fit(X) X_t = encoder.transform(X) diff --git a/evalml/tests/component_tests/test_pca.py b/evalml/tests/component_tests/test_pca.py index d58a158c15..e0748dff33 100644 --- a/evalml/tests/component_tests/test_pca.py +++ b/evalml/tests/component_tests/test_pca.py @@ -24,11 +24,11 @@ def 
test_pca_numeric(): def test_pca_array(): - X = [[3, 0, 1, 6], - [1, 2, 1, 6], - [10, 2, 1, 6], - [10, 2, 2, 5], - [6, 2, 2, 5]] + X = np.array([[3, 0, 1, 6], + [1, 2, 1, 6], + [10, 2, 1, 6], + [10, 2, 2, 5], + [6, 2, 2, 5]]) pca = PCA() expected_X_t = pd.DataFrame([[3.176246, 1.282616], [4.969987, -0.702976], diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py index 780baf8b3f..298b2361bc 100644 --- a/evalml/tests/component_tests/test_simple_imputer.py +++ b/evalml/tests/component_tests/test_simple_imputer.py @@ -6,7 +6,7 @@ from evalml.pipelines.components import SimpleImputer -def test_median(): +def test_simple_imputer_median(): X = pd.DataFrame([[np.nan, 0, 1, np.nan], [1, 2, 3, 2], [10, 2, np.nan, 2], @@ -22,7 +22,7 @@ def test_median(): assert_frame_equal(X_expected_arr, X_t, check_dtype=False) -def test_mean(): +def test_simple_imputer_mean(): X = pd.DataFrame([[np.nan, 0, 1, np.nan], [1, 2, 3, 2], [1, 2, 3, 0]]) @@ -35,7 +35,7 @@ def test_mean(): assert_frame_equal(X_expected_arr, X_t, check_dtype=False) -def test_constant(): +def test_simple_imputer_constant(): # test impute strategy is constant and fill value is not specified X = pd.DataFrame([[np.nan, 0, 1, np.nan], ["a", 2, np.nan, 3], @@ -45,11 +45,12 @@ def test_constant(): X_expected_arr = pd.DataFrame([[3, 0, 1, 3], ["a", 2, 3, 3], ["b", 2, 3, 0]]) + X_expected_arr = X_expected_arr.astype({0: 'category'}) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) -def test_most_frequent(): +def test_simple_imputer_most_frequent(): X = pd.DataFrame([[np.nan, 0, 1, np.nan], ["a", 2, np.nan, 3], ["b", 2, 1, 0]]) @@ -58,11 +59,12 @@ def test_most_frequent(): X_expected_arr = pd.DataFrame([["a", 0, 1, 0], ["a", 2, 1, 3], ["b", 2, 1, 0]]) + X_expected_arr = X_expected_arr.astype({0: 'category'}) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) -def 
test_col_with_non_numeric(): +def test_simple_imputer_col_with_non_numeric(): # test col with all strings X = pd.DataFrame([["a", 0, 1, np.nan], ["b", 2, 3, 3], @@ -86,6 +88,7 @@ def test_col_with_non_numeric(): ["b", 2, 3, 3], ["a", 2, 3, 1], ["a", 2, 3, 0]]) + X_expected_arr = X_expected_arr.astype({0: 'category'}) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) @@ -94,11 +97,12 @@ def test_col_with_non_numeric(): ["b", 2, 3, 3], ["a", 2, 3, 1], [2, 2, 3, 0]]) + X_expected_arr = X_expected_arr.astype({0: 'category'}) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) -def test_fit_transform_drop_all_nan_columns(): +def test_simple_imputer_fit_transform_drop_all_nan_columns(): X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], "some_nan": [np.nan, 1, 0], "another_col": [0, 1, 2]}) @@ -112,7 +116,7 @@ def test_fit_transform_drop_all_nan_columns(): "another_col": [0, 1, 2]})) -def test_transform_drop_all_nan_columns(): +def test_simple_imputer_transform_drop_all_nan_columns(): X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], "some_nan": [np.nan, 1, 0], "another_col": [0, 1, 2]}) @@ -125,7 +129,7 @@ def test_transform_drop_all_nan_columns(): "another_col": [0, 1, 2]})) -def test_transform_drop_all_nan_columns_empty(): +def test_simple_imputer_transform_drop_all_nan_columns_empty(): X = pd.DataFrame([[np.nan, np.nan, np.nan]]) transformer = SimpleImputer(impute_strategy='most_frequent') assert transformer.fit_transform(X).empty @@ -137,7 +141,7 @@ def test_transform_drop_all_nan_columns_empty(): assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]])) -def test_numpy_input(): +def test_simple_imputer_numpy_input(): X = np.array([[np.nan, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]]) @@ -171,7 +175,7 @@ def test_simple_imputer_fill_value(data_type): fill_value = "fill" expected = pd.DataFrame({ "categorical with nan": pd.Series(["fill", "1", "fill", "0", 
"3"], dtype='category'), - "object with nan": ["b", "b", "fill", "c", "fill"], + "object with nan": pd.Series(["b", "b", "fill", "c", "fill"], dtype='category'), }) y = pd.Series([0, 0, 1, 0, 1]) imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value) @@ -226,6 +230,6 @@ def test_simple_imputer_with_none(): imputer.fit(X, y) transformed = imputer.transform(X, y) expected = pd.DataFrame({"category with None": pd.Series(["b", "a", "a", "a"], dtype='category'), - "boolean with None": [True, True, False, True], - "object with None": ["b", "a", "a", "a"]}) + "boolean with None": pd.Series([True, True, False, True], dtype='category'), + "object with None": pd.Series(["b", "a", "a", "a"], dtype='category')}) assert_frame_equal(transformed, expected, check_dtype=False) diff --git a/evalml/tests/utils_tests/test_gen_utils.py b/evalml/tests/utils_tests/test_gen_utils.py index cdb8722159..e9ebc68f55 100644 --- a/evalml/tests/utils_tests/test_gen_utils.py +++ b/evalml/tests/utils_tests/test_gen_utils.py @@ -4,10 +4,14 @@ import numpy as np import pandas as pd import pytest +import woodwork as ww from evalml.pipelines.components import ComponentBase from evalml.utils.gen_utils import ( SEED_BOUNDS, + _convert_to_woodwork_structure, + _convert_woodwork_types_wrapper, + _rename_column_names_to_numeric, check_random_state_equality, classproperty, convert_to_seconds, @@ -273,3 +277,117 @@ def test_drop_nan(data, expected): no_nan_1, no_nan_2 = drop_rows_with_nans(*data) _check_equality(no_nan_1, expected[0], check_index_type=False) _check_equality(no_nan_2, expected[1], check_index_type=False) + + +def test_rename_column_names_to_numeric(): + X = np.array([[1, 2], [3, 4]]) + pd.testing.assert_frame_equal(_rename_column_names_to_numeric(X), pd.DataFrame(X)) + + X = pd.DataFrame({"<>": [1, 2], ">>": [2, 4]}) + pd.testing.assert_frame_equal(_rename_column_names_to_numeric(X), pd.DataFrame({0: [1, 2], 1: [2, 4]})) + + X = ww.DataTable(pd.DataFrame({"<>": [1, 2], 
">>": [2, 4]}), logical_types={"<>": "categorical", ">>": "categorical"}) + X_renamed = _rename_column_names_to_numeric(X) + X_expected = pd.DataFrame({0: pd.Series([1, 2], dtype="category"), 1: pd.Series([2, 4], dtype="category")}) + pd.testing.assert_frame_equal(X_renamed.to_dataframe(), X_expected) + assert X_renamed.logical_types == {0: ww.logical_types.Categorical, 1: ww.logical_types.Categorical} + + +def test_convert_woodwork_types_wrapper_with_nan(): + y = _convert_woodwork_types_wrapper(pd.Series([1, 2, None], dtype="Int64")) + pd.testing.assert_series_equal(y, pd.Series([1, 2, np.nan], dtype="float64")) + + y = _convert_woodwork_types_wrapper(pd.array([1, 2, None], dtype="Int64")) + pd.testing.assert_series_equal(y, pd.Series([1, 2, np.nan], dtype="float64")) + + y = _convert_woodwork_types_wrapper(pd.Series(["a", "b", None], dtype="string")) + pd.testing.assert_series_equal(y, pd.Series(["a", "b", np.nan], dtype="object")) + + y = _convert_woodwork_types_wrapper(pd.array(["a", "b", None], dtype="string")) + pd.testing.assert_series_equal(y, pd.Series(["a", "b", np.nan], dtype="object")) + + y = _convert_woodwork_types_wrapper(pd.Series([True, False, None], dtype="boolean")) + pd.testing.assert_series_equal(y, pd.Series([True, False, np.nan])) + + y = _convert_woodwork_types_wrapper(pd.array([True, False, None], dtype="boolean")) + pd.testing.assert_series_equal(y, pd.Series([True, False, np.nan])) + + +def test_convert_woodwork_types_wrapper(): + y = _convert_woodwork_types_wrapper(pd.Series([1, 2, 3], dtype="Int64")) + pd.testing.assert_series_equal(y, pd.Series([1, 2, 3], dtype="int64")) + + y = _convert_woodwork_types_wrapper(pd.array([1, 2, 3], dtype="Int64")) + pd.testing.assert_series_equal(y, pd.Series([1, 2, 3], dtype="int64")) + + y = _convert_woodwork_types_wrapper(pd.Series(["a", "b", "a"], dtype="string")) + pd.testing.assert_series_equal(y, pd.Series(["a", "b", "a"], dtype="object")) + + y = _convert_woodwork_types_wrapper(pd.array(["a", 
"b", "a"], dtype="string")) + pd.testing.assert_series_equal(y, pd.Series(["a", "b", "a"], dtype="object")) + + y = _convert_woodwork_types_wrapper(pd.Series([True, False, True], dtype="boolean")) + pd.testing.assert_series_equal(y, pd.Series([True, False, True], dtype="bool")) + + y = _convert_woodwork_types_wrapper(pd.array([True, False, True], dtype="boolean")) + pd.testing.assert_series_equal(y, pd.Series([True, False, True], dtype="bool")) + + +def test_convert_woodwork_types_wrapper_dataframe(): + X = pd.DataFrame({"Int series": pd.Series([1, 2, 3], dtype="Int64"), + "Int array": pd.array([1, 2, 3], dtype="Int64"), + "Int series with nan": pd.Series([1, 2, None], dtype="Int64"), + "Int array with nan": pd.array([1, 2, None], dtype="Int64"), + "string series": pd.Series(["a", "b", "a"], dtype="string"), + "string array": pd.array(["a", "b", "a"], dtype="string"), + "string series with nan": pd.Series(["a", "b", None], dtype="string"), + "string array with nan": pd.array(["a", "b", None], dtype="string"), + "boolean series": pd.Series([True, False, True], dtype="boolean"), + "boolean array": pd.array([True, False, True], dtype="boolean"), + "boolean series with nan": pd.Series([True, False, None], dtype="boolean"), + "boolean array with nan": pd.array([True, False, None], dtype="boolean") + }) + X_expected = pd.DataFrame({"Int series": pd.Series([1, 2, 3], dtype="int64"), + "Int array": pd.array([1, 2, 3], dtype="int64"), + "Int series with nan": pd.Series([1, 2, np.nan], dtype="float64"), + "Int array with nan": pd.array([1, 2, np.nan], dtype="float64"), + "string series": pd.Series(["a", "b", "a"], dtype="object"), + "string array": pd.array(["a", "b", "a"], dtype="object"), + "string series with nan": pd.Series(["a", "b", np.nan], dtype="object"), + "string array with nan": pd.array(["a", "b", np.nan], dtype="object"), + "boolean series": pd.Series([True, False, True], dtype="bool"), + "boolean array": pd.array([True, False, True], dtype="bool"), + "boolean 
series with nan": pd.Series([True, False, np.nan], dtype="object"), + "boolean array with nan": pd.array([True, False, np.nan], dtype="object") + }) + pd.testing.assert_frame_equal(X_expected, _convert_woodwork_types_wrapper(X)) + + +def test_convert_to_woodwork_structure(): + X_dt = ww.DataTable(pd.DataFrame([[1, 2], [3, 4]])) + pd.testing.assert_frame_equal(X_dt.to_dataframe(), _convert_to_woodwork_structure(X_dt).to_dataframe()) + + X_dc = ww.DataColumn(pd.Series([1, 2, 3, 4])) + pd.testing.assert_series_equal(X_dc.to_series(), _convert_to_woodwork_structure(X_dc).to_series()) + + X_pd = pd.DataFrame({0: pd.Series([1, 2], dtype="Int64"), + 1: pd.Series([3, 4], dtype="Int64")}) + pd.testing.assert_frame_equal(X_pd, _convert_to_woodwork_structure(X_pd).to_dataframe()) + + X_pd = pd.Series([1, 2, 3, 4], dtype="Int64") + pd.testing.assert_series_equal(X_pd, _convert_to_woodwork_structure(X_pd).to_series()) + + X_list = [1, 2, 3, 4] + X_expected = ww.DataColumn(pd.Series(X_list)) + pd.testing.assert_series_equal(X_expected.to_series(), _convert_to_woodwork_structure(X_list).to_series()) + assert X_list == [1, 2, 3, 4] + + X_np = np.array([1, 2, 3, 4]) + X_expected = ww.DataColumn(pd.Series(X_np)) + pd.testing.assert_series_equal(X_expected.to_series(), _convert_to_woodwork_structure(X_np).to_series()) + assert np.array_equal(X_np, np.array([1, 2, 3, 4])) + + X_np = np.array([[1, 2], [3, 4]]) + X_expected = ww.DataTable(pd.DataFrame(X_np)) + pd.testing.assert_frame_equal(X_expected.to_dataframe(), _convert_to_woodwork_structure(X_np).to_dataframe()) + assert np.array_equal(X_np, np.array([[1, 2], [3, 4]])) diff --git a/evalml/utils/__init__.py b/evalml/utils/__init__.py index b76533e8f6..36bdba3281 100644 --- a/evalml/utils/__init__.py +++ b/evalml/utils/__init__.py @@ -10,6 +10,7 @@ jupyter_check, safe_repr, _convert_woodwork_types_wrapper, + _convert_to_woodwork_structure, drop_rows_with_nans, pad_with_nans ) diff --git a/evalml/utils/gen_utils.py 
b/evalml/utils/gen_utils.py
index 92b87516f9..d04b1c07c1 100644
--- a/evalml/utils/gen_utils.py
+++ b/evalml/utils/gen_utils.py
@@ -210,7 +210,7 @@ def get_importable_subclasses(base_class, used_in_automl=True):
 
 
 def _rename_column_names_to_numeric(X):
-    """Used in XGBoost classifier and regressor classes to rename column names
+    """Used in LightGBM classifier class and XGBoost classifier and regressor classes to rename column names
        when the input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot natively handle.
 
     Arguments:
@@ -219,8 +219,18 @@ def _rename_column_names_to_numeric(X):
     Returns:
         Transformed X where column names are renamed to numerical values
     """
-    name_to_col_num = dict((col, col_num) for col_num, col in enumerate(X.columns.values))
-    return X.rename(columns=name_to_col_num, inplace=False)
+    X_t = X
+    if isinstance(X, np.ndarray):
+        return pd.DataFrame(X)
+    if isinstance(X, ww.DataTable):
+        X_t = X.to_dataframe()
+        logical_types = X.logical_types
+    name_to_col_num = dict((col, col_num) for col_num, col in enumerate(list(X.columns)))
+    X_renamed = X_t.rename(columns=name_to_col_num, inplace=False)
+    if isinstance(X, ww.DataTable):
+        renamed_logical_types = dict((name_to_col_num[col], logical_types[col]) for col in logical_types)
+        return ww.DataTable(X_renamed, logical_types=renamed_logical_types)
+    return X_renamed
 
 
 def jupyter_check():
@@ -277,19 +287,22 @@ def _convert_to_woodwork_structure(data):
     """
     Takes input data structure, and if it is not a Woodwork data structure already, will convert it to a Woodwork DataTable or DataColumn structure.
     """
+    ww_data = data
     if isinstance(data, ww.DataTable) or isinstance(data, ww.DataColumn):
-        return data
+        return ww_data
     # Convert numpy data structures to pandas data structures
     if isinstance(data, list):
-        data = np.array(data)
-    if isinstance(data, pd.api.extensions.ExtensionArray) or (isinstance(data, np.ndarray) and len(data.shape) == 1):
-        data = pd.Series(data)
-    elif isinstance(data, np.ndarray):
-        data = pd.DataFrame(data)
+        ww_data = np.array(data)
+
+    if isinstance(ww_data, pd.api.extensions.ExtensionArray) or (isinstance(ww_data, np.ndarray) and len(ww_data.shape) == 1):
+        ww_data = pd.Series(ww_data)
+    elif isinstance(ww_data, np.ndarray):
+        ww_data = pd.DataFrame(ww_data)
+
     # Convert pandas data structures to Woodwork data structures
-    if isinstance(data, pd.Series):
-        return ww.DataColumn(data)
-    return ww.DataTable(data)
+    if isinstance(ww_data, pd.Series):
+        return ww.DataColumn(ww_data)
+    return ww.DataTable(ww_data, copy_dataframe=True)
 
 
 def _convert_woodwork_types_wrapper(pd_data):
@@ -300,18 +313,30 @@ def _convert_woodwork_types_wrapper(pd_data):
         pd_data (pd.Series, pd.DataFrame, pd.ExtensionArray): Pandas data structure
 
     Returns:
-        New pandas data structure (pd.DataFrame or pd.Series) with original data and dtypes that can be handled by numpy
+        Modified pandas data structure (pd.DataFrame or pd.Series) with original data and dtypes that can be handled by numpy. A pd.DataFrame input is converted column-by-column in place and returned with its index unchanged.
     """
     nullable_to_numpy_mapping = {pd.Int64Dtype: 'int64',
                                  pd.BooleanDtype: 'bool',
                                  pd.StringDtype: 'object'}
-
-    if isinstance(pd_data, pd.Series) and type(pd_data.dtype) in nullable_to_numpy_mapping:
-        return pd_data.astype(nullable_to_numpy_mapping[type(pd_data.dtype)])
+    nullable_to_numpy_mapping_nan = {pd.Int64Dtype: 'float64',
+                                     pd.BooleanDtype: 'object',
+                                     pd.StringDtype: 'object'}
+
+    if isinstance(pd_data, pd.api.extensions.ExtensionArray):
+        if pd.isna(pd_data).any():
+            return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping_nan[type(pd_data.dtype)])
+        return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping[type(pd_data.dtype)])
+    if (isinstance(pd_data, pd.Series) and type(pd_data.dtype) in nullable_to_numpy_mapping):
+        if pd.isna(pd_data).any():
+            return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping_nan[type(pd_data.dtype)], index=pd_data.index)
+        return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping[type(pd_data.dtype)], index=pd_data.index)
     if isinstance(pd_data, pd.DataFrame):
-        for col_name, col in pd_data.iteritems():
+        for col_name, col in pd_data.items():
             if type(col.dtype) in nullable_to_numpy_mapping:
-                pd_data[col_name] = pd_data[col_name].astype(nullable_to_numpy_mapping[type(col.dtype)])
+                if pd.isna(pd_data[col_name]).any():
+                    pd_data[col_name] = pd.Series(pd_data[col_name].to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping_nan[type(pd_data[col_name].dtype)], index=pd_data.index)  # reuse the frame's index: column assignment aligns on labels, not position
+                else:
+                    pd_data[col_name] = pd_data[col_name].astype(nullable_to_numpy_mapping[type(col.dtype)])
     return pd_data
 
 