From f54abd3fe2e0f95b2e92c387e5ca575d868eb006 Mon Sep 17 00:00:00 2001
From: Angela Lin <angela97lin@gmail.com>
Date: Wed, 18 Nov 2020 20:12:04 -0500
Subject: [PATCH] Update components to accept Woodwork inputs (#1423)

---
 docs/source/release_notes.rst                 |   7 +-
 evalml/pipelines/classification_pipeline.py   |   8 --
 evalml/pipelines/components/component_base.py |  21 +++-
 .../ensemble/stacked_ensemble_base.py         |  37 +-----
 .../ensemble/stacked_ensemble_classifier.py   |  15 ---
 .../classifiers/baseline_classifier.py        |  17 ++-
 .../classifiers/catboost_classifier.py        |  17 ++-
 .../classifiers/lightgbm_classifier.py        |  50 ++++----
 .../classifiers/xgboost_classifier.py         |  12 +-
 .../components/estimators/estimator.py        |  12 +-
 .../regressors/baseline_regressor.py          |  13 +-
 .../regressors/catboost_regressor.py          |  18 +--
 .../regressors/xgboost_regressor.py           |   9 +-
 .../transformers/column_selectors.py          |  11 +-
 .../dimensionality_reduction/pca.py           |  18 +--
 .../transformers/encoders/onehot_encoder.py   |  57 +++++----
 .../transformers/imputers/imputer.py          |  23 ++--
 .../imputers/per_column_imputer.py            |  28 +++--
 .../transformers/imputers/simple_imputer.py   |  24 ++--
 .../preprocessing/datetime_featurizer.py      |  17 +--
 .../preprocessing/drop_null_columns.py        |  19 +--
 .../transformers/preprocessing/lsa.py         |  16 ++-
 .../preprocessing/text_featurizer.py          |  12 +-
 evalml/pipelines/pipeline_base.py             |  13 +-
 evalml/pipelines/regression_pipeline.py       |   6 -
 .../test_baseline_classifier.py               |  11 +-
 .../test_datetime_featurizer.py               |   2 +-
 evalml/tests/component_tests/test_imputer.py  |  21 ++--
 evalml/tests/component_tests/test_lsa.py      |  14 ---
 .../component_tests/test_one_hot_encoder.py   |  26 +++-
 evalml/tests/component_tests/test_pca.py      |  10 +-
 .../component_tests/test_simple_imputer.py    |  28 +++--
 evalml/tests/utils_tests/test_gen_utils.py    | 118 ++++++++++++++++++
 evalml/utils/__init__.py                      |   1 +
 evalml/utils/gen_utils.py                     |  59 ++++++---
 35 files changed, 465 insertions(+), 305 deletions(-)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 881daf72b0..f8a4e33cc3 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -3,9 +3,10 @@ Release Notes
 
 **Future Releases**
     * Enhancements
-        * Added ability to freeze hyperparameters for AutoMLSearch :pr:`1284`
-        * Added `Target Encoder` into transformer components :pr:`1401`
-        * Updated pipelines and ``make_pipeline`` to accept Woodwork DataTables :pr:`1393`
+        * Updated pipelines and ``make_pipeline`` to accept ``Woodwork`` inputs :pr:`1393`
+        * Updated components to accept ``Woodwork`` inputs :pr:`1423`
+        * Added ability to freeze hyperparameters for ``AutoMLSearch`` :pr:`1284`
+        * Added ``Target Encoder`` into transformer components :pr:`1401`
         * Added callback for error handling in ``AutoMLSearch`` :pr:`1403`
         * Added the index id to the ``explain_predictions_best_worst`` output to help users identify which rows in their data are included :pr:`1365`
         * The top_k features displayed in ``explain_predictions_*`` functions are now determined by the magnitude of shap values as opposed to the ``top_k`` largest and smallest shap values. :pr:`1374`
diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py
index c03ffca278..e0b759efc6 100644
--- a/evalml/pipelines/classification_pipeline.py
+++ b/evalml/pipelines/classification_pipeline.py
@@ -41,7 +41,6 @@ def fit(self, X, y):
         """
         X = _convert_to_woodwork_structure(X)
         y = _convert_to_woodwork_structure(y)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         y = _convert_woodwork_types_wrapper(y.to_series())
         self._encoder.fit(y)
         y = self._encode_targets(y)
@@ -92,8 +91,6 @@ def predict(self, X, objective=None):
         Returns:
             pd.Series : Estimated labels
         """
-        X = _convert_to_woodwork_structure(X)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         predictions = self._predict(X, objective)
         return pd.Series(self._decode_targets(predictions))
 
@@ -106,8 +103,6 @@ def predict_proba(self, X):
         Returns:
             pd.DataFrame: Probability estimates
         """
-        X = _convert_to_woodwork_structure(X)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         X = self.compute_estimator_features(X, y=None)
         proba = self.estimator.predict_proba(X)
         proba.columns = self._encoder.classes_
@@ -124,11 +119,8 @@ def score(self, X, y, objectives):
         Returns:
             dict: Ordered dictionary of objective scores
         """
-        X = _convert_to_woodwork_structure(X)
         y = _convert_to_woodwork_structure(y)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         y = _convert_woodwork_types_wrapper(y.to_series())
-
         objectives = [get_objective(o, return_instance=True) for o in objectives]
         y = self._encode_targets(y)
         y_predicted, y_predicted_proba = self._compute_predictions(X, objectives)
diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py
index d15e17af7e..df9bd48c91 100644
--- a/evalml/pipelines/components/component_base.py
+++ b/evalml/pipelines/components/component_base.py
@@ -13,6 +13,10 @@
     log_subtitle,
     safe_repr
 )
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 logger = get_logger(__file__)
 
@@ -71,7 +75,7 @@ def clone(self, random_state=0):
         """Constructs a new component with the same parameters
 
         Arguments:
-            random_state (int): the value to seed the random state with. Can also be a RandomState instance. Defaults to 0.
+            random_state (int, RandomState): The value to seed the random state with, given as an int or a RandomState instance. Defaults to 0.
 
         Returns:
             A new instance of this component with identical parameters
@@ -82,12 +86,17 @@ def fit(self, X, y=None):
         """Fits component to data
 
         Arguments:
-            X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
-            y (pd.Series, optional): the target training data of length [n_samples]
+            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
+            y (ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples]
 
         Returns:
             self
         """
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
+        if y is not None:
+            y = _convert_to_woodwork_structure(y)
+            y = _convert_woodwork_types_wrapper(y.to_series())
         try:
             self._component_obj.fit(X, y)
             return self
@@ -119,8 +128,8 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL):
         """Saves component at file path
 
         Arguments:
-            file_path (str): location to save file
-            pickle_protocol (int): the pickle data stream format.
+            file_path (str): Location to save file
+            pickle_protocol (int): The pickle data stream format.
 
         Returns:
             None
@@ -133,7 +142,7 @@ def load(file_path):
         """Loads component at file path
 
         Arguments:
-            file_path (str): location to load file
+            file_path (str): Location to load file
 
         Returns:
             ComponentBase object
diff --git a/evalml/pipelines/components/ensemble/stacked_ensemble_base.py b/evalml/pipelines/components/ensemble/stacked_ensemble_base.py
index 852381d96d..025bc52daf 100644
--- a/evalml/pipelines/components/ensemble/stacked_ensemble_base.py
+++ b/evalml/pipelines/components/ensemble/stacked_ensemble_base.py
@@ -1,5 +1,3 @@
-import pandas as pd
-
 from evalml.exceptions import EnsembleMissingPipelinesError
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components import Estimator
@@ -78,33 +76,8 @@ def default_parameters(cls):
          Returns:
              dict: default parameters for this component.
         """
-        return {'final_estimator': None,
-                'cv': None,
-                'n_jobs': 1,
-                }
-
-    def fit(self, X, y=None):
-        """Fits component to data
-
-        Arguments:
-            X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
-            y (pd.Series, optional): the target training data of length [n_samples]
-
-        Returns:
-            self
-        """
-        self._component_obj.fit(X, y)
-        return self
-
-    def predict(self, X):
-        """Make predictions using selected features.
-
-        Arguments:
-            X (pd.DataFrame): Features
-
-        Returns:
-            pd.Series: Predicted values
-        """
-        predictions = self._component_obj.predict(X)
-        predictions = pd.Series(predictions)
-        return predictions
+        return {
+            'final_estimator': None,
+            'cv': None,
+            'n_jobs': 1,
+        }
diff --git a/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py b/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py
index f8cb9b6b09..96f88e2672 100644
--- a/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py
+++ b/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py
@@ -1,4 +1,3 @@
-import pandas as pd
 from sklearn.ensemble import StackingClassifier
 from sklearn.model_selection import StratifiedKFold
 
@@ -41,17 +40,3 @@ def __init__(self, input_pipelines=None, final_estimator=None,
         """
         super().__init__(input_pipelines=input_pipelines, final_estimator=final_estimator,
                          cv=cv, n_jobs=n_jobs, random_state=random_state, **kwargs)
-
-    def predict_proba(self, X):
-        """Make probability estimates for labels.
-
-        Arguments:
-            X (pd.DataFrame): Features
-
-        Returns:
-            pd.DataFrame: Probability estimates
-        """
-        pred_proba = self._component_obj.predict_proba(X)
-        if not isinstance(pred_proba, pd.DataFrame):
-            pred_proba = pd.DataFrame(pred_proba)
-        return pred_proba
diff --git a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py
index 6c7f5f2f22..81dbfb897e 100644
--- a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py
@@ -5,6 +5,10 @@
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class BaselineClassifier(Estimator):
@@ -40,11 +44,10 @@ def __init__(self, strategy="mode", random_state=0, **kwargs):
     def fit(self, X, y=None):
         if y is None:
             raise ValueError("Cannot fit Baseline classifier if y is None")
-
-        if not isinstance(y, pd.Series):
-            y = pd.Series(y)
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
+        y = _convert_to_woodwork_structure(y)
+        y = _convert_woodwork_types_wrapper(y.to_series())
 
         vals, counts = np.unique(y, return_counts=True)
         self._classes = list(vals)
@@ -57,6 +60,8 @@ def fit(self, X, y=None):
         return self
 
     def predict(self, X):
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         strategy = self.parameters["strategy"]
         if strategy == "mode":
             return pd.Series([self._mode] * len(X))
@@ -66,6 +71,8 @@ def predict(self, X):
             return self.random_state.choice(self._classes, len(X), p=self._percentage_freq)
 
     def predict_proba(self, X):
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         strategy = self.parameters["strategy"]
         if strategy == "mode":
             mode_index = self._classes.index(self._mode)
diff --git a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py
index b2fddb00b6..691507a998 100644
--- a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py
@@ -9,7 +9,10 @@
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
 from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise
-from evalml.utils.gen_utils import categorical_dtypes
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class CatBoostClassifier(Estimator):
@@ -56,11 +59,11 @@ def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None,
                          random_state=random_state)
 
     def fit(self, X, y=None):
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
-        if not isinstance(y, pd.Series):
-            y = pd.Series(y)
-        cat_cols = X.select_dtypes(categorical_dtypes)
+        X = _convert_to_woodwork_structure(X)
+        cat_cols = list(X.select('category').columns)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
+        y = _convert_to_woodwork_structure(y)
+        y = _convert_woodwork_types_wrapper(y.to_series())
 
         # For binary classification, catboost expects numeric values, so encoding before.
         if y.nunique() <= 2:
@@ -70,6 +73,8 @@ def fit(self, X, y=None):
         return model
 
     def predict(self, X):
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         predictions = self._component_obj.predict(X)
         if predictions.ndim == 2 and predictions.shape[1] == 1:
             predictions = predictions.flatten()
diff --git a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py
index 7066f58aa1..4700fe48e1 100644
--- a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py
@@ -10,7 +10,11 @@
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
 from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise
-from evalml.utils.gen_utils import categorical_dtypes
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper,
+    _rename_column_names_to_numeric
+)
 
 
 class LightGBMClassifier(Estimator):
@@ -69,42 +73,42 @@ def __init__(self, boosting_type="gbdt", learning_rate=0.1, n_estimators=100, ma
                          random_state=random_seed)
 
     def _encode_categories(self, X, fit=False):
-        X2 = pd.DataFrame(copy.copy(X))
-        # encode each categorical feature as an integer
-        X2.columns = np.arange(len(X2.columns))
-        # necessary to wipe out column names in case any names contain symbols ([, ], <) which LightGBM cannot properly handle
-        cat_cols = X2.select_dtypes(categorical_dtypes).columns
+        """Encodes each categorical feature using ordinal encoding."""
+        X_encoded = _convert_to_woodwork_structure(X)
+        X_encoded = _rename_column_names_to_numeric(X_encoded)
+        cat_cols = list(X_encoded.select('category').columns)
+        X_encoded = _convert_woodwork_types_wrapper(X_encoded.to_dataframe())
         if len(cat_cols) == 0:
-            return X2
+            return X_encoded
         if fit:
             self._ordinal_encoder = OrdinalEncoder()
-            encoder_output = self._ordinal_encoder.fit_transform(X2[cat_cols])
+            encoder_output = self._ordinal_encoder.fit_transform(X_encoded[cat_cols])
         else:
-            encoder_output = self._ordinal_encoder.transform(X2[cat_cols])
-        X2[cat_cols] = pd.DataFrame(encoder_output)
-        X2[cat_cols] = X2[cat_cols].astype('category')
-        return X2
+            encoder_output = self._ordinal_encoder.transform(X_encoded[cat_cols])
+        X_encoded[cat_cols] = pd.DataFrame(encoder_output, index=X_encoded.index)  # reuse the input index: assignment aligns on index labels, so a fresh RangeIndex would yield NaN rows
+        X_encoded[cat_cols] = X_encoded[cat_cols].astype('category')
+        return X_encoded
 
     def _encode_labels(self, y):
-        y1 = pd.Series(y)
+        y_encoded = pd.Series(y)
         # change only if dtype isn't int
-        if not is_integer_dtype(y1):
+        if not is_integer_dtype(y_encoded):
             self._label_encoder = LabelEncoder()
-            y1 = pd.Series(self._label_encoder.fit_transform(y1), dtype='int64')
-        return y1
+            y_encoded = pd.Series(self._label_encoder.fit_transform(y_encoded), dtype='int64')
+        return y_encoded
 
     def fit(self, X, y=None):
-        X2 = self._encode_categories(X, fit=True)
-        y2 = self._encode_labels(y)
-        return super().fit(X2, y2)
+        X_encoded = self._encode_categories(X, fit=True)
+        y_encoded = self._encode_labels(y)
+        return super().fit(X_encoded, y_encoded)
 
     def predict(self, X):
-        X2 = self._encode_categories(X)
-        predictions = super().predict(X2)
+        X_encoded = self._encode_categories(X)
+        predictions = super().predict(X_encoded)
         if self._label_encoder:
             predictions = pd.Series(self._label_encoder.inverse_transform(predictions.astype(np.int64)))
         return predictions
 
     def predict_proba(self, X):
-        X2 = self._encode_categories(X)
-        return super().predict_proba(X2)
+        X_encoded = self._encode_categories(X)
+        return super().predict_proba(X_encoded)
diff --git a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py
index 165bd0d7b8..b937fc8e9f 100644
--- a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py
@@ -1,4 +1,3 @@
-import pandas as pd
 from skopt.space import Integer, Real
 
 from evalml.model_family import ModelFamily
@@ -42,21 +41,16 @@ def __init__(self, eta=0.1, max_depth=6, min_child_weight=1, n_estimators=100, r
                          random_state=random_state)
 
     def fit(self, X, y=None):
-        # rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle
-        if isinstance(X, pd.DataFrame):
-            X = _rename_column_names_to_numeric(X)
+        X = _rename_column_names_to_numeric(X)
         return super().fit(X, y)
 
     def predict(self, X):
-        # rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle
-        if isinstance(X, pd.DataFrame):
-            X = _rename_column_names_to_numeric(X)
+        X = _rename_column_names_to_numeric(X)
         predictions = super().predict(X)
         return predictions
 
     def predict_proba(self, X):
-        if isinstance(X, pd.DataFrame):
-            X = _rename_column_names_to_numeric(X)
+        X = _rename_column_names_to_numeric(X)
         predictions = super().predict_proba(X)
         return predictions
 
diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py
index 252ae2a605..1ad9e98455 100644
--- a/evalml/pipelines/components/estimators/estimator.py
+++ b/evalml/pipelines/components/estimators/estimator.py
@@ -4,6 +4,10 @@
 
 from evalml.exceptions import MethodPropertyNotFoundError
 from evalml.pipelines.components import ComponentBase
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class Estimator(ComponentBase):
@@ -28,12 +32,14 @@ def predict(self, X):
         """Make predictions using selected features.
 
         Arguments:
-            X (pd.DataFrame): Features
+            X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
 
         Returns:
             pd.Series: Predicted values
         """
         try:
+            X = _convert_to_woodwork_structure(X)
+            X = _convert_woodwork_types_wrapper(X.to_dataframe())
             predictions = self._component_obj.predict(X)
         except AttributeError:
             raise MethodPropertyNotFoundError("Estimator requires a predict method or a component_obj that implements predict")
@@ -45,12 +51,14 @@ def predict_proba(self, X):
         """Make probability estimates for labels.
 
         Arguments:
-            X (pd.DataFrame): Features
+            X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
 
         Returns:
             pd.DataFrame: Probability estimates
         """
         try:
+            X = _convert_to_woodwork_structure(X)
+            X = _convert_woodwork_types_wrapper(X.to_dataframe())
             pred_proba = self._component_obj.predict_proba(X)
         except AttributeError:
             raise MethodPropertyNotFoundError("Estimator requires a predict_proba method or a component_obj that implements predict_proba")
diff --git a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py
index ee31b20526..2b46addd99 100644
--- a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py
@@ -4,6 +4,10 @@
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class BaselineRegressor(Estimator):
@@ -38,9 +42,10 @@ def __init__(self, strategy="mean", random_state=0, **kwargs):
     def fit(self, X, y=None):
         if y is None:
             raise ValueError("Cannot fit Baseline regressor if y is None")
-
-        if not isinstance(y, pd.Series):
-            y = pd.Series(y)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
+        y = _convert_to_woodwork_structure(y)
+        y = _convert_woodwork_types_wrapper(y.to_series())
 
         if self.parameters["strategy"] == "mean":
             self._prediction_value = y.mean()
@@ -50,6 +55,8 @@ def fit(self, X, y=None):
         return self
 
     def predict(self, X):
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         return pd.Series([self._prediction_value] * len(X))
 
     @property
diff --git a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py
index 042eb32979..505cff948a 100644
--- a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py
@@ -1,13 +1,15 @@
 import copy
 
-import pandas as pd
 from skopt.space import Integer, Real
 
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
 from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise
-from evalml.utils.gen_utils import categorical_dtypes
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class CatBoostRegressor(Estimator):
@@ -53,11 +55,13 @@ def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None,
                          random_state=random_state)
 
     def fit(self, X, y=None):
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
-        if not isinstance(y, pd.Series):
-            y = pd.Series(y)
-        cat_cols = X.select_dtypes(categorical_dtypes)
+        X = _convert_to_woodwork_structure(X)
+        cat_cols = list(X.select('category').columns)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
+
+        y = _convert_to_woodwork_structure(y)
+        y = _convert_woodwork_types_wrapper(y.to_series())
+
         model = self._component_obj.fit(X, y, silent=True, cat_features=cat_cols)
         return model
 
diff --git a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py
index b620b68715..6d2e5b94b7 100644
--- a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py
+++ b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py
@@ -1,4 +1,3 @@
-import pandas as pd
 from skopt.space import Integer, Real
 
 from evalml.model_family import ModelFamily
@@ -42,15 +41,11 @@ def __init__(self, eta=0.1, max_depth=6, min_child_weight=1, n_estimators=100, r
                          random_state=random_state)
 
     def fit(self, X, y=None):
-        # rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle
-        if isinstance(X, pd.DataFrame):
-            X = _rename_column_names_to_numeric(X)
+        X = _rename_column_names_to_numeric(X)
         return super().fit(X, y)
 
     def predict(self, X):
-        # rename column names to column number if input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot properly handle
-        if isinstance(X, pd.DataFrame):
-            X = _rename_column_names_to_numeric(X)
+        X = _rename_column_names_to_numeric(X)
         predictions = super().predict(X)
         return predictions
 
diff --git a/evalml/pipelines/components/transformers/column_selectors.py b/evalml/pipelines/components/transformers/column_selectors.py
index b71cef566d..1e176c8f1d 100644
--- a/evalml/pipelines/components/transformers/column_selectors.py
+++ b/evalml/pipelines/components/transformers/column_selectors.py
@@ -1,9 +1,12 @@
 from abc import abstractmethod
 
 import numpy as np
-import pandas as pd
 
 from evalml.pipelines.components.transformers import Transformer
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class ColumnSelector(Transformer):
@@ -56,10 +59,8 @@ def fit(self, X, y=None):
         return self
 
     def transform(self, X, y=None):
-
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
-
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         self._check_input_for_columns(X)
 
         cols = self.parameters.get("columns") or []
diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py
index e4a8974eec..b5e935c8d9 100644
--- a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py
+++ b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py
@@ -3,7 +3,11 @@
 from skopt.space import Real
 
 from evalml.pipelines.components.transformers import Transformer
-from evalml.utils.gen_utils import is_all_numeric
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper,
+    is_all_numeric
+)
 
 
 class PCA(Transformer):
@@ -33,8 +37,8 @@ def __init__(self, variance=0.95, n_components=None, random_state=0, **kwargs):
                          random_state=random_state)
 
     def fit(self, X, y=None):
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         if not is_all_numeric(X):
             raise ValueError("PCA input must be all numeric")
 
@@ -42,8 +46,8 @@ def fit(self, X, y=None):
         return self
 
     def transform(self, X, y=None):
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         if not is_all_numeric(X):
             raise ValueError("PCA input must be all numeric")
 
@@ -51,8 +55,8 @@ def transform(self, X, y=None):
         return pd.DataFrame(X_t, index=X.index, columns=[f"component_{i}" for i in range(X_t.shape[1])])
 
     def fit_transform(self, X, y=None):
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         if not is_all_numeric(X):
             raise ValueError("PCA input must be all numeric")
 
diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py
index d80c266201..a095334439 100644
--- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py
+++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py
@@ -3,9 +3,12 @@
 import pandas as pd
 from sklearn.preprocessing import OneHotEncoder as SKOneHotEncoder
 
-from ..transformer import Transformer
-
 from evalml.pipelines.components import ComponentBaseMeta
+from evalml.pipelines.components.transformers.transformer import Transformer
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class OneHotEncoderMeta(ComponentBaseMeta):
@@ -82,20 +85,16 @@ def _get_cat_cols(X):
 
     def fit(self, X, y=None):
         top_n = self.parameters['top_n']
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         X_t = X
-
         if self.features_to_encode is None:
             self.features_to_encode = self._get_cat_cols(X_t)
         invalid_features = [col for col in self.features_to_encode if col not in list(X.columns)]
         if len(invalid_features) > 0:
             raise ValueError("Could not find and encode {} in input data.".format(', '.join(invalid_features)))
 
-        if self.parameters['handle_missing'] == "as_category":
-            X_t[self.features_to_encode] = X_t[self.features_to_encode].replace(np.nan, "nan")
-        elif self.parameters['handle_missing'] == "error" and X.isnull().any().any():
-            raise ValueError("Input contains NaN")
+        X_t = self._handle_parameter_handle_missing(X_t)
 
         if len(self.features_to_encode) == 0:
             categories = 'auto'
@@ -137,35 +136,41 @@ def transform(self, X, y=None):
         Returns:
             Transformed dataframe, where each categorical feature has been encoded into numerical columns using one-hot encoding.
         """
-
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
-
-        cat_cols = self.features_to_encode
-
-        if self.parameters['handle_missing'] == "as_category":
-            X[cat_cols] = X[cat_cols].replace(np.nan, "nan")
-        if self.parameters['handle_missing'] == "error" and X.isnull().any().any():
-            raise ValueError("Input contains NaN")
+        X_copy = _convert_to_woodwork_structure(X)
+        X_copy = _convert_woodwork_types_wrapper(X_copy.to_dataframe())
+        X_copy = self._handle_parameter_handle_missing(X_copy)
 
         X_t = pd.DataFrame()
         # Add the non-categorical columns, untouched
-        for col in X.columns:
-            if col not in cat_cols:
-                X_t = pd.concat([X_t, X[col]], axis=1)
+        for col in X_copy.columns:
+            if col not in self.features_to_encode:
+                X_t = pd.concat([X_t, X_copy[col]], axis=1)
         # The call to pd.concat above changes the type of the index so we will manually keep it the same.
         if not X_t.empty:
-            X_t.index = X.index
+            X_t.index = X_copy.index
 
         # Call sklearn's transform on the categorical columns
-        if len(cat_cols) > 0:
-            X_cat = pd.DataFrame(self._encoder.transform(X[cat_cols]).toarray(), index=X.index)
-            cat_cols_str = [str(c) for c in cat_cols]
+        if len(self.features_to_encode) > 0:
+            X_cat = pd.DataFrame(self._encoder.transform(X_copy[self.features_to_encode]).toarray(), index=X_copy.index)
+            cat_cols_str = [str(c) for c in self.features_to_encode]
             X_cat.columns = self._encoder.get_feature_names(input_features=cat_cols_str)
             X_t = pd.concat([X_t, X_cat], axis=1)
 
         return X_t
 
+    def _handle_parameter_handle_missing(self, X):
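+        """Helper method that applies the `handle_missing` parameter: raises ValueError if X contains NaN for 'error'; fills NaN in the encoded columns with 'nan' for 'as_category'."""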
+        cat_cols = self.features_to_encode
+        if self.parameters['handle_missing'] == "error" and X.isnull().any().any():
+            raise ValueError("Input contains NaN")
+        if self.parameters['handle_missing'] == "as_category":
+            for col in cat_cols:
+                if X[col].dtype == 'category' and pd.isna(X[col]).any():
+                    X[col] = X[col].cat.add_categories("nan")
+                    X[col] = X[col].where(~pd.isna(X[col]), other='nan')
+            X[cat_cols] = X[cat_cols].replace(np.nan, "nan")
+        return X
+
     def categories(self, feature_name):
         """Returns a list of the unique categories to be encoded for the particular feature, in order.
 
diff --git a/evalml/pipelines/components/transformers/imputers/imputer.py b/evalml/pipelines/components/transformers/imputers/imputer.py
index 9dd0df8285..82c9dbd0aa 100644
--- a/evalml/pipelines/components/transformers/imputers/imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/imputer.py
@@ -1,8 +1,9 @@
-import pandas as pd
-
 from evalml.pipelines.components.transformers import Transformer
 from evalml.pipelines.components.transformers.imputers import SimpleImputer
-from evalml.utils.gen_utils import boolean, categorical_dtypes, numeric_dtypes
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class Imputer(Transformer):
@@ -62,19 +63,22 @@ def fit(self, X, y=None):
         Returns:
             self
         """
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        cat_cols = list(X.select('category').columns)
+        numeric_cols = list(X.select('numeric').columns)
+
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
 
         self._all_null_cols = set(X.columns) - set(X.dropna(axis=1, how='all').columns)
         X_copy = X.copy()
         X_null_dropped = X_copy.drop(self._all_null_cols, axis=1, errors='ignore')
 
-        X_numerics = X_null_dropped.select_dtypes(include=numeric_dtypes)
+        X_numerics = X_null_dropped[[col for col in numeric_cols if col not in self._all_null_cols]]
         if len(X_numerics.columns) > 0:
             self._numeric_imputer.fit(X_numerics, y)
             self._numeric_cols = X_numerics.columns
 
-        X_categorical = X_null_dropped.select_dtypes(include=categorical_dtypes + boolean)
+        X_categorical = X_null_dropped[[col for col in cat_cols if col not in self._all_null_cols]]
         if len(X_categorical.columns) > 0:
             self._categorical_imputer.fit(X_categorical, y)
             self._categorical_cols = X_categorical.columns
@@ -91,8 +95,9 @@ def transform(self, X, y=None):
         Returns:
             pd.DataFrame: Transformed X
         """
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
+
         X_null_dropped = X.copy()
         X_null_dropped.drop(self._all_null_cols, inplace=True, axis=1, errors='ignore')
         X_null_dropped.reset_index(inplace=True, drop=True)
diff --git a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py
index af4ffccb74..8ddf18bc37 100644
--- a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py
@@ -1,9 +1,11 @@
-import pandas as pd
-
 from evalml.pipelines.components.transformers import Transformer
 from evalml.pipelines.components.transformers.imputers.simple_imputer import (
     SimpleImputer
 )
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class PerColumnImputer(Transformer):
@@ -43,14 +45,14 @@ def fit(self, X, y=None):
         """Fits imputers on input data
 
         Arguments:
-            X (pd.DataFrame): Data to fit
-            y (pd.Series, optional): Ignored.
+            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit.
+            y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples]. Ignored.
 
         Returns:
             self
         """
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         self.imputers = dict()
         for column in X.columns:
             strategy_dict = self.impute_strategies.get(column, dict())
@@ -64,17 +66,17 @@ def fit(self, X, y=None):
         return self
 
     def transform(self, X, y=None):
-        """Transforms input data by imputing missing values
+        """Transforms input data by imputing missing values.
 
         Arguments:
-            X (pd.DataFrame): Data to transform
-            y (pd.Series, optional): Ignored.
+            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to transform.
+            y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples]. Ignored.
 
         Returns:
             pd.DataFrame: Transformed X
         """
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         X_t = X.copy()
         cols_to_drop = []
         for column, imputer in self.imputers.items():
@@ -90,8 +92,8 @@ def fit_transform(self, X, y=None):
         """Fits imputer and imputes missing values in input data.
 
         Arguments:
-            X (pd.DataFrame): Data to fit and transform
-            y (pd.Series): Target data.
+            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit and transform.
+            y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples]. Ignored.
 
         Returns:
             pd.DataFrame: Transformed X
diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
index 6b7d42403a..55cc268671 100644
--- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
@@ -3,6 +3,10 @@
 from sklearn.impute import SimpleImputer as SkImputer
 
 from evalml.pipelines.components.transformers import Transformer
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class SimpleImputer(Transformer):
@@ -35,14 +39,14 @@ def fit(self, X, y=None):
             treated as the same.
 
         Arguments:
-            X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
-            y (pd.Series, optional): the target training data of length [n_samples]
+            X (ww.DataTable, pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
+            y (ww.DataColumn, pd.Series, optional): the target training data of length [n_samples]
 
         Returns:
             self
         """
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         # Convert None to np.nan, since None cannot be properly handled
         X = X.fillna(value=np.nan)
 
@@ -55,14 +59,14 @@ def transform(self, X, y=None):
             treated as the same.
 
         Arguments:
-            X (pd.DataFrame): Data to transform
-            y (pd.Series, optional): Ignored.
+            X (ww.DataTable, pd.DataFrame): Data to transform
+            y (ww.DataColumn, pd.Series, optional): Ignored.
 
         Returns:
             pd.DataFrame: Transformed X
         """
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         # Convert None to np.nan, since None cannot be properly handled
         X = X.fillna(value=np.nan)
 
@@ -81,8 +85,8 @@ def fit_transform(self, X, y=None):
         """Fits on X and transforms X
 
         Arguments:
-            X (pd.DataFrame): Data to fit and transform
-            y (pd. DataFrame): Target data.
+            X (ww.DataTable, pd.DataFrame): Data to fit and transform
+            y (ww.DataColumn, pd.Series, optional): Target data.
 
         Returns:
             pd.DataFrame: Transformed X
diff --git a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py
index 39f3114f62..4ccb6c42ad 100644
--- a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py
@@ -1,7 +1,9 @@
-import pandas as pd
-
 from evalml.pipelines.components.transformers import Transformer
-from evalml.utils.gen_utils import datetime_dtypes
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper,
+    datetime_dtypes
+)
 
 
 def _extract_year(col):
@@ -51,8 +53,8 @@ def __init__(self, features_to_extract=None, random_state=0, **kwargs):
                          random_state=random_state)
 
     def fit(self, X, y=None):
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         self._date_time_col_names = X.select_dtypes(include=datetime_dtypes).columns
         return self
 
@@ -66,10 +68,9 @@ def transform(self, X, y=None):
         Returns:
             pd.DataFrame: Transformed X
         """
-
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         X_t = X
-        if not isinstance(X_t, pd.DataFrame):
-            X_t = pd.DataFrame(X_t)
         features_to_extract = self.parameters["features_to_extract"]
         if len(features_to_extract) == 0:
             return X_t
diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py
index b9cbe33e6b..9ba7981b5c 100644
--- a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py
+++ b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py
@@ -1,6 +1,8 @@
-import pandas as pd
-
 from evalml.pipelines.components.transformers import Transformer
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class DropNullColumns(Transformer):
@@ -28,9 +30,9 @@ def __init__(self, pct_null_threshold=1.0, random_state=0, **kwargs):
 
     def fit(self, X, y=None):
         pct_null_threshold = self.parameters["pct_null_threshold"]
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
-        percent_null = X.isnull().mean()
+        X_t = _convert_to_woodwork_structure(X)
+        X_t = _convert_woodwork_types_wrapper(X_t.to_dataframe())
+        percent_null = X_t.isnull().mean()
         if pct_null_threshold == 0.0:
             null_cols = percent_null[percent_null > 0]
         else:
@@ -48,7 +50,6 @@ def transform(self, X, y=None):
         Returns:
             pd.DataFrame: Transformed X
         """
-
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
-        return X.drop(columns=self._cols_to_drop, axis=1)
+        X_t = _convert_to_woodwork_structure(X)
+        X_t = _convert_woodwork_types_wrapper(X_t.to_dataframe())
+        return X_t.drop(columns=self._cols_to_drop, axis=1)
diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py
index 9871f147d9..2cce495e8d 100644
--- a/evalml/pipelines/components/transformers/preprocessing/lsa.py
+++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py
@@ -6,6 +6,10 @@
 from evalml.pipelines.components.transformers.preprocessing import (
     TextTransformer
 )
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class LSA(TextTransformer):
@@ -28,8 +32,8 @@ def __init__(self, text_columns=None, random_state=0, **kwargs):
     def fit(self, X, y=None):
         if len(self._all_text_columns) == 0:
             return self
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         text_columns = self._get_text_columns(X)
         corpus = X[text_columns].values.flatten()
         # we assume non-str values will have been filtered out prior to calling LSA.fit. this is a safeguard.
@@ -41,15 +45,15 @@ def transform(self, X, y=None):
         """Transforms data X by applying the LSA pipeline.
 
         Arguments:
-            X (pd.DataFrame): Data to transform
-            y (pd.Series, optional): Ignored.
+            X (ww.DataTable, pd.DataFrame): Data to transform
+            y (ww.DataColumn, pd.Series, optional): Ignored.
 
         Returns:
             pd.DataFrame: Transformed X. The original column is removed and replaced with two columns of the
                           format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1.
         """
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         if len(self._all_text_columns) == 0:
             return X
 
diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
index ff74c2f4f7..a58385018e 100644
--- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
@@ -8,6 +8,10 @@
     LSA,
     TextTransformer
 )
+from evalml.utils.gen_utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper
+)
 
 
 class TextFeaturizer(TextTransformer):
@@ -70,8 +74,8 @@ def fit(self, X, y=None):
         """
         if len(self._all_text_columns) == 0:
             return self
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
 
         text_columns = self._get_text_columns(X)
         es = self._make_entity_set(X, text_columns)
@@ -92,8 +96,8 @@ def transform(self, X, y=None):
         Returns:
             pd.DataFrame: Transformed X
         """
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
+        X = _convert_to_woodwork_structure(X)
+        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         if self._features is None or len(self._features) == 0:
             return X
 
diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py
index 1dbc3caa63..c0a6ee1312 100644
--- a/evalml/pipelines/pipeline_base.py
+++ b/evalml/pipelines/pipeline_base.py
@@ -20,6 +20,7 @@
 )
 from evalml.pipelines.pipeline_base_meta import PipelineBaseMeta
 from evalml.utils import (
+    _convert_to_woodwork_structure,
     check_random_state_equality,
     classproperty,
     get_logger,
@@ -30,10 +31,6 @@
     log_title,
     safe_repr
 )
-from evalml.utils.gen_utils import (
-    _convert_to_woodwork_structure,
-    _convert_woodwork_types_wrapper
-)
 
 logger = get_logger(__file__)
 
@@ -197,16 +194,13 @@ def compute_estimator_features(self, X, y=None):
     def _compute_features_during_fit(self, X, y):
         X_t = X
         for component in self.component_graph[:-1]:
-            self.input_feature_names.update({component.name: list(pd.DataFrame(X_t))})
+            self.input_feature_names.update({component.name: list(X_t.columns)})
             X_t = component.fit_transform(X_t, y=y)
-
-        self.input_feature_names.update({self.estimator.name: list(pd.DataFrame(X_t))})
-
+        self.input_feature_names.update({self.estimator.name: list(X_t.columns)})
         return X_t
 
     def _fit(self, X, y):
         X_t = self._compute_features_during_fit(X, y)
-
         self.estimator.fit(X_t, y)
 
     @abstractmethod
@@ -233,7 +227,6 @@ def predict(self, X, objective=None):
             pd.Series: Predicted values.
         """
         X = _convert_to_woodwork_structure(X)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         X_t = self.compute_estimator_features(X, y=None)
         return self.estimator.predict(X_t)
 
diff --git a/evalml/pipelines/regression_pipeline.py b/evalml/pipelines/regression_pipeline.py
index 66b14e3640..7c28087bfe 100644
--- a/evalml/pipelines/regression_pipeline.py
+++ b/evalml/pipelines/regression_pipeline.py
@@ -26,7 +26,6 @@ def fit(self, X, y):
         """
         X = _convert_to_woodwork_structure(X)
         y = _convert_to_woodwork_structure(y)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
         y = _convert_woodwork_types_wrapper(y.to_series())
         if y.dtype not in numeric_dtypes:
             raise ValueError(f"Regression pipeline cannot handle targets with dtype: {y.dtype}")
@@ -44,11 +43,6 @@ def score(self, X, y, objectives):
         Returns:
             dict: Ordered dictionary of objective scores
         """
-        X = _convert_to_woodwork_structure(X)
-        y = _convert_to_woodwork_structure(y)
-        X = _convert_woodwork_types_wrapper(X.to_dataframe())
-        y = _convert_woodwork_types_wrapper(y.to_series())
-
         objectives = [get_objective(o, return_instance=True) for o in objectives]
         y_predicted = self.predict(X)
         return self._score_all_objectives(X, y, y_predicted, y_pred_proba=None, objectives=objectives)
diff --git a/evalml/tests/component_tests/test_baseline_classifier.py b/evalml/tests/component_tests/test_baseline_classifier.py
index 081996de1c..3333e49fed 100644
--- a/evalml/tests/component_tests/test_baseline_classifier.py
+++ b/evalml/tests/component_tests/test_baseline_classifier.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import woodwork as ww
 
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components import BaselineClassifier
@@ -25,15 +26,19 @@ def test_baseline_y_is_None(X_y_binary):
         BaselineClassifier().fit(X, y=None)
 
 
-def test_baseline_binary_mode(X_y_binary):
+@pytest.mark.parametrize('data_type', ['pd', 'ww'])
+def test_baseline_binary_mode(data_type, X_y_binary):
     X = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [2, 3, 4, 5], 'three': [1, 2, 3, 4]})
     y = pd.Series([10, 11, 10, 10])
+    if data_type == 'ww':
+        X = ww.DataTable(X)
+        y = ww.DataColumn(y)
     clf = BaselineClassifier(strategy="mode")
     clf.fit(X, y)
     assert clf.classes_ == [10, 11]
-    np.testing.assert_allclose(clf.predict(X), np.array([10] * len(X)))
+    np.testing.assert_allclose(clf.predict(X), np.array([10] * X.shape[0]))
     predicted_proba = clf.predict_proba(X)
-    assert predicted_proba.shape == (len(X), 2)
+    assert predicted_proba.shape == (X.shape[0], 2)
     expected_predicted_proba = pd.DataFrame({10: [1., 1., 1., 1.], 11: [0., 0., 0., 0.]})
     pd.testing.assert_frame_equal(expected_predicted_proba, predicted_proba)
     np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1]))
diff --git a/evalml/tests/component_tests/test_datetime_featurizer.py b/evalml/tests/component_tests/test_datetime_featurizer.py
index 1e1b056982..a01c75927f 100644
--- a/evalml/tests/component_tests/test_datetime_featurizer.py
+++ b/evalml/tests/component_tests/test_datetime_featurizer.py
@@ -77,6 +77,6 @@ def test_datetime_featurizer_no_datetime_cols():
 
 def test_datetime_featurizer_numpy_array_input():
     datetime_transformer = DateTimeFeaturizer()
-    X = np.array(['2007-02-03', '2016-06-07', '2020-05-19'], dtype='datetime64')
+    X = np.array([['2007-02-03'], ['2016-06-07'], ['2020-05-19']], dtype='datetime64')
     datetime_transformer.fit(X)
     assert list(datetime_transformer.transform(X).columns) == ["0_year", "0_month", "0_day_of_week", "0_hour"]
diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py
index 2cf7cbe764..4401562a88 100644
--- a/evalml/tests/component_tests/test_imputer.py
+++ b/evalml/tests/component_tests/test_imputer.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import woodwork as ww
 from pandas.testing import assert_frame_equal
 
 from evalml.pipelines.components import Imputer
@@ -95,10 +96,10 @@ def test_categorical_only_input(imputer_test_data):
     transformed = imputer.transform(X, y)
     expected = pd.DataFrame({
         "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
-        "object col": ["b", "b", "a", "c", "d"],
+        "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'),
         "bool col": [True, False, False, True, True],
         "categorical with nan": pd.Series(["0", "1", "0", "0", "3"], dtype='category'),
-        "object with nan": ["b", "b", "b", "c", "b"],
+        "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype='category'),
         "bool col with nan": [True, True, False, True, True]
     })
 
@@ -116,13 +117,13 @@ def test_categorical_and_numeric_input(imputer_test_data):
     expected = pd.DataFrame({
         "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
         "int col": [0, 1, 2, 0, 3],
-        "object col": ["b", "b", "a", "c", "d"],
+        "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'),
         "float col": [0.0, 1.0, 0.0, -2.0, 5.],
         "bool col": [True, False, False, True, True],
         "categorical with nan": pd.Series(["0", "1", "0", "0", "3"], dtype='category'),
         "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
         "float with nan": [0.0, 1.0, 0, -1.0, 0.],
-        "object with nan": ["b", "b", "b", "c", "b"],
+        "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype='category'),
         "bool col with nan": [True, True, False, True, True]
     })
     assert_frame_equal(transformed, expected, check_dtype=False)
@@ -181,12 +182,16 @@ def test_imputer_datetime_input():
     assert_frame_equal(transformed, X, check_dtype=False)
 
 
-@pytest.mark.parametrize("data_type", ['np', 'pd'])
+@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww'])
 def test_imputer_empty_data(data_type):
     if data_type == 'pd':
         X = pd.DataFrame()
         y = pd.Series()
         expected = pd.DataFrame(index=pd.Int64Index([]), columns=pd.Index([]))
+    elif data_type == 'ww':
+        X = ww.DataTable(pd.DataFrame())
+        y = ww.DataColumn(pd.Series())
+        expected = pd.DataFrame(index=pd.Int64Index([]), columns=pd.Index([]))
     else:
         X = np.array([[]])
         y = np.array([])
@@ -234,7 +239,7 @@ def test_imputer_fill_value(imputer_test_data):
         "int with nan": [-1, 1, 0, 0, 1],
         "categorical with nan": pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'),
         "float with nan": [0.0, 1.0, -1, -1.0, 0.],
-        "object with nan": ["b", "b", "fill", "c", "fill"],
+        "object with nan": pd.Series(["b", "b", "fill", "c", "fill"], dtype='category'),
         "bool col with nan": [True, "fill", False, "fill", True]
     })
     assert_frame_equal(transformed, expected, check_dtype=False)
@@ -254,7 +259,7 @@ def test_imputer_no_nans(imputer_test_data):
     transformed = imputer.transform(X, y)
     expected = pd.DataFrame({
         "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
-        "object col": ["b", "b", "a", "c", "d"],
+        "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'),
         "bool col": [True, False, False, True, True],
     })
     assert_frame_equal(transformed, expected, check_dtype=False)
@@ -280,7 +285,7 @@ def test_imputer_with_none():
                              "float with None": [0.1, 0.0, 0.5, 0.2],
                              "category with None": pd.Series(["b", "a", "a", "a"], dtype='category'),
                              "boolean with None": [True, True, False, True],
-                             "object with None": ["b", "a", "a", "a"]})
+                             "object with None": pd.Series(["b", "a", "a", "a"], dtype='category')})
     assert_frame_equal(transformed, expected, check_dtype=False)
 
     imputer = Imputer()
diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py
index 5c86a16f0f..22435736d1 100644
--- a/evalml/tests/component_tests/test_lsa.py
+++ b/evalml/tests/component_tests/test_lsa.py
@@ -152,20 +152,6 @@ def test_int_col_names():
     assert X_t.dtypes.all() == np.float64
 
 
-def test_repeat_col_names():
-    X = pd.DataFrame(data=np.array([['identical string one', 'identical string one'],
-                                    ['second double string', 'second double string'],
-                                    ['copy the third', 'copy the third']]), columns=['col_1', 'col_1'])
-    lsa = LSA(text_columns=['col_1', 'col_1'])
-    lsa.fit(X)
-    expected_col_names = ['LSA(col_1)[0]',
-                          'LSA(col_1)[1]']
-    X_t = lsa.transform(X)
-    np.testing.assert_array_equal(X_t.columns, np.array(expected_col_names))
-    assert len(X_t.columns) == 2
-    assert X_t.dtypes.all() == np.float64
-
-
 def test_lsa_output():
     X = pd.DataFrame(
         {'lsa': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py
index cf9549eccd..22308e7d95 100644
--- a/evalml/tests/component_tests/test_one_hot_encoder.py
+++ b/evalml/tests/component_tests/test_one_hot_encoder.py
@@ -1,10 +1,15 @@
 import numpy as np
 import pandas as pd
 import pytest
+import woodwork as ww
 
 from evalml.exceptions import ComponentNotYetFittedError
 from evalml.pipelines.components import OneHotEncoder
-from evalml.utils import get_random_state
+from evalml.utils import (
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper,
+    get_random_state
+)
 
 
 def test_init():
@@ -147,7 +152,6 @@ def test_no_top_n():
                       "col_2": ["a", "c", "d", "b", "e", "e", "f", "a", "b", "c", "d"],
                       "col_3": ["a", "a", "a", "a", "a", "a", "b", "a", "a", "b", "b"],
                       "col_4": [2, 0, 1, 3, 0, 1, 2, 0, 2, 1, 2]})
-
     expected_col_names = set(["col_3_a", "col_3_b", "col_4"])
     for val in X["col_1"]:
         expected_col_names.add("col_1_" + val)
@@ -228,6 +232,10 @@ def test_more_top_n_unique_values():
     encoder = OneHotEncoder(top_n=5, random_state=random_seed)
     encoder.fit(X)
     X_t = encoder.transform(X)
+
+    # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need to make the conversion here too
+    X = _convert_to_woodwork_structure(X)
+    X = _convert_woodwork_types_wrapper(X.to_dataframe())
     col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
     col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
     col_1_counts = col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort')
@@ -260,6 +268,10 @@ def test_more_top_n_unique_values_large():
     encoder = OneHotEncoder(top_n=3, random_state=random_seed)
     encoder.fit(X)
     X_t = encoder.transform(X)
+
+    # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need to make the conversion here too
+    X = _convert_to_woodwork_structure(X)
+    X = _convert_woodwork_types_wrapper(X.to_dataframe())
     col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
     col_1_counts = col_1_counts.sample(frac=1, random_state=test_random_state)
     col_1_counts = col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort')
@@ -310,7 +322,7 @@ def test_numpy_input():
     encoder = OneHotEncoder()
     encoder.fit(X)
     X_t = encoder.transform(X)
-    assert pd.DataFrame(X).equals(X_t)
+    pd.testing.assert_frame_equal(pd.DataFrame(X), X_t, check_dtype=False)
 
 
 def test_large_number_of_categories():
@@ -329,16 +341,18 @@ def test_large_number_of_categories():
     assert set(expected_col_names) == set(list(X_t.columns))
 
 
-@pytest.mark.parametrize('data_type', ['list', 'np', 'pd_no_index', 'pd_index'])
+@pytest.mark.parametrize('data_type', ['list', 'np', 'pd_no_index', 'pd_index', 'ww'])
 def test_data_types(data_type):
     if data_type == 'list':
-        X = ["a", "b", "c"]
+        X = [["a"], ["b"], ["c"]]
     elif data_type == 'np':
-        X = np.array(["a", "b", "c"])
+        X = np.array([["a"], ["b"], ["c"]])
     elif data_type == 'pd_no_index':
         X = pd.DataFrame(["a", "b", "c"])
     elif data_type == 'pd_index':
         X = pd.DataFrame(["a", "b", "c"], columns=['0'])
+    elif data_type == 'ww':
+        X = ww.DataTable(pd.DataFrame(["a", "b", "c"]))
     encoder = OneHotEncoder()
     encoder.fit(X)
     X_t = encoder.transform(X)
diff --git a/evalml/tests/component_tests/test_pca.py b/evalml/tests/component_tests/test_pca.py
index d58a158c15..e0748dff33 100644
--- a/evalml/tests/component_tests/test_pca.py
+++ b/evalml/tests/component_tests/test_pca.py
@@ -24,11 +24,11 @@ def test_pca_numeric():
 
 
 def test_pca_array():
-    X = [[3, 0, 1, 6],
-         [1, 2, 1, 6],
-         [10, 2, 1, 6],
-         [10, 2, 2, 5],
-         [6, 2, 2, 5]]
+    X = np.array([[3, 0, 1, 6],
+                  [1, 2, 1, 6],
+                  [10, 2, 1, 6],
+                  [10, 2, 2, 5],
+                  [6, 2, 2, 5]])
     pca = PCA()
     expected_X_t = pd.DataFrame([[3.176246, 1.282616],
                                  [4.969987, -0.702976],
diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py
index 780baf8b3f..298b2361bc 100644
--- a/evalml/tests/component_tests/test_simple_imputer.py
+++ b/evalml/tests/component_tests/test_simple_imputer.py
@@ -6,7 +6,7 @@
 from evalml.pipelines.components import SimpleImputer
 
 
-def test_median():
+def test_simple_imputer_median():
     X = pd.DataFrame([[np.nan, 0, 1, np.nan],
                       [1, 2, 3, 2],
                       [10, 2, np.nan, 2],
@@ -22,7 +22,7 @@ def test_median():
     assert_frame_equal(X_expected_arr, X_t, check_dtype=False)
 
 
-def test_mean():
+def test_simple_imputer_mean():
     X = pd.DataFrame([[np.nan, 0, 1, np.nan],
                       [1, 2, 3, 2],
                       [1, 2, 3, 0]])
@@ -35,7 +35,7 @@ def test_mean():
     assert_frame_equal(X_expected_arr, X_t, check_dtype=False)
 
 
-def test_constant():
+def test_simple_imputer_constant():
     # test impute strategy is constant and fill value is not specified
     X = pd.DataFrame([[np.nan, 0, 1, np.nan],
                       ["a", 2, np.nan, 3],
@@ -45,11 +45,12 @@ def test_constant():
     X_expected_arr = pd.DataFrame([[3, 0, 1, 3],
                                    ["a", 2, 3, 3],
                                    ["b", 2, 3, 0]])
+    X_expected_arr = X_expected_arr.astype({0: 'category'})
     X_t = transformer.fit_transform(X)
     assert_frame_equal(X_expected_arr, X_t, check_dtype=False)
 
 
-def test_most_frequent():
+def test_simple_imputer_most_frequent():
     X = pd.DataFrame([[np.nan, 0, 1, np.nan],
                       ["a", 2, np.nan, 3],
                       ["b", 2, 1, 0]])
@@ -58,11 +59,12 @@ def test_most_frequent():
     X_expected_arr = pd.DataFrame([["a", 0, 1, 0],
                                    ["a", 2, 1, 3],
                                    ["b", 2, 1, 0]])
+    X_expected_arr = X_expected_arr.astype({0: 'category'})
     X_t = transformer.fit_transform(X)
     assert_frame_equal(X_expected_arr, X_t, check_dtype=False)
 
 
-def test_col_with_non_numeric():
+def test_simple_imputer_col_with_non_numeric():
     # test col with all strings
     X = pd.DataFrame([["a", 0, 1, np.nan],
                       ["b", 2, 3, 3],
@@ -86,6 +88,7 @@ def test_col_with_non_numeric():
                                    ["b", 2, 3, 3],
                                    ["a", 2, 3, 1],
                                    ["a", 2, 3, 0]])
+    X_expected_arr = X_expected_arr.astype({0: 'category'})
     X_t = transformer.fit_transform(X)
     assert_frame_equal(X_expected_arr, X_t, check_dtype=False)
 
@@ -94,11 +97,12 @@ def test_col_with_non_numeric():
                                    ["b", 2, 3, 3],
                                    ["a", 2, 3, 1],
                                    [2, 2, 3, 0]])
+    X_expected_arr = X_expected_arr.astype({0: 'category'})
     X_t = transformer.fit_transform(X)
     assert_frame_equal(X_expected_arr, X_t, check_dtype=False)
 
 
-def test_fit_transform_drop_all_nan_columns():
+def test_simple_imputer_fit_transform_drop_all_nan_columns():
     X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan],
                       "some_nan": [np.nan, 1, 0],
                       "another_col": [0, 1, 2]})
@@ -112,7 +116,7 @@ def test_fit_transform_drop_all_nan_columns():
                                         "another_col": [0, 1, 2]}))
 
 
-def test_transform_drop_all_nan_columns():
+def test_simple_imputer_transform_drop_all_nan_columns():
     X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan],
                       "some_nan": [np.nan, 1, 0],
                       "another_col": [0, 1, 2]})
@@ -125,7 +129,7 @@ def test_transform_drop_all_nan_columns():
                                         "another_col": [0, 1, 2]}))
 
 
-def test_transform_drop_all_nan_columns_empty():
+def test_simple_imputer_transform_drop_all_nan_columns_empty():
     X = pd.DataFrame([[np.nan, np.nan, np.nan]])
     transformer = SimpleImputer(impute_strategy='most_frequent')
     assert transformer.fit_transform(X).empty
@@ -137,7 +141,7 @@ def test_transform_drop_all_nan_columns_empty():
     assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]]))
 
 
-def test_numpy_input():
+def test_simple_imputer_numpy_input():
     X = np.array([[np.nan, 0, 1, np.nan],
                   [np.nan, 2, 3, 2],
                   [np.nan, 2, 3, 0]])
@@ -171,7 +175,7 @@ def test_simple_imputer_fill_value(data_type):
         fill_value = "fill"
         expected = pd.DataFrame({
             "categorical with nan": pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'),
-            "object with nan": ["b", "b", "fill", "c", "fill"],
+            "object with nan": pd.Series(["b", "b", "fill", "c", "fill"], dtype='category'),
         })
     y = pd.Series([0, 0, 1, 0, 1])
     imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
@@ -226,6 +230,6 @@ def test_simple_imputer_with_none():
     imputer.fit(X, y)
     transformed = imputer.transform(X, y)
     expected = pd.DataFrame({"category with None": pd.Series(["b", "a", "a", "a"], dtype='category'),
-                             "boolean with None": [True, True, False, True],
-                             "object with None": ["b", "a", "a", "a"]})
+                             "boolean with None": pd.Series([True, True, False, True], dtype='category'),
+                             "object with None": pd.Series(["b", "a", "a", "a"], dtype='category')})
     assert_frame_equal(transformed, expected, check_dtype=False)
diff --git a/evalml/tests/utils_tests/test_gen_utils.py b/evalml/tests/utils_tests/test_gen_utils.py
index cdb8722159..e9ebc68f55 100644
--- a/evalml/tests/utils_tests/test_gen_utils.py
+++ b/evalml/tests/utils_tests/test_gen_utils.py
@@ -4,10 +4,14 @@
 import numpy as np
 import pandas as pd
 import pytest
+import woodwork as ww
 
 from evalml.pipelines.components import ComponentBase
 from evalml.utils.gen_utils import (
     SEED_BOUNDS,
+    _convert_to_woodwork_structure,
+    _convert_woodwork_types_wrapper,
+    _rename_column_names_to_numeric,
     check_random_state_equality,
     classproperty,
     convert_to_seconds,
@@ -273,3 +277,117 @@ def test_drop_nan(data, expected):
     no_nan_1, no_nan_2 = drop_rows_with_nans(*data)
     _check_equality(no_nan_1, expected[0], check_index_type=False)
     _check_equality(no_nan_2, expected[1], check_index_type=False)
+
+
+def test_rename_column_names_to_numeric():
+    X = np.array([[1, 2], [3, 4]])
+    pd.testing.assert_frame_equal(_rename_column_names_to_numeric(X), pd.DataFrame(X))
+
+    X = pd.DataFrame({"<>": [1, 2], ">>": [2, 4]})
+    pd.testing.assert_frame_equal(_rename_column_names_to_numeric(X), pd.DataFrame({0: [1, 2], 1: [2, 4]}))
+
+    X = ww.DataTable(pd.DataFrame({"<>": [1, 2], ">>": [2, 4]}), logical_types={"<>": "categorical", ">>": "categorical"})
+    X_renamed = _rename_column_names_to_numeric(X)
+    X_expected = pd.DataFrame({0: pd.Series([1, 2], dtype="category"), 1: pd.Series([2, 4], dtype="category")})
+    pd.testing.assert_frame_equal(X_renamed.to_dataframe(), X_expected)
+    assert X_renamed.logical_types == {0: ww.logical_types.Categorical, 1: ww.logical_types.Categorical}
+
+
+def test_convert_woodwork_types_wrapper_with_nan():
+    y = _convert_woodwork_types_wrapper(pd.Series([1, 2, None], dtype="Int64"))
+    pd.testing.assert_series_equal(y, pd.Series([1, 2, np.nan], dtype="float64"))
+
+    y = _convert_woodwork_types_wrapper(pd.array([1, 2, None], dtype="Int64"))
+    pd.testing.assert_series_equal(y, pd.Series([1, 2, np.nan], dtype="float64"))
+
+    y = _convert_woodwork_types_wrapper(pd.Series(["a", "b", None], dtype="string"))
+    pd.testing.assert_series_equal(y, pd.Series(["a", "b", np.nan], dtype="object"))
+
+    y = _convert_woodwork_types_wrapper(pd.array(["a", "b", None], dtype="string"))
+    pd.testing.assert_series_equal(y, pd.Series(["a", "b", np.nan], dtype="object"))
+
+    y = _convert_woodwork_types_wrapper(pd.Series([True, False, None], dtype="boolean"))
+    pd.testing.assert_series_equal(y, pd.Series([True, False, np.nan]))
+
+    y = _convert_woodwork_types_wrapper(pd.array([True, False, None], dtype="boolean"))
+    pd.testing.assert_series_equal(y, pd.Series([True, False, np.nan]))
+
+
+def test_convert_woodwork_types_wrapper():
+    y = _convert_woodwork_types_wrapper(pd.Series([1, 2, 3], dtype="Int64"))
+    pd.testing.assert_series_equal(y, pd.Series([1, 2, 3], dtype="int64"))
+
+    y = _convert_woodwork_types_wrapper(pd.array([1, 2, 3], dtype="Int64"))
+    pd.testing.assert_series_equal(y, pd.Series([1, 2, 3], dtype="int64"))
+
+    y = _convert_woodwork_types_wrapper(pd.Series(["a", "b", "a"], dtype="string"))
+    pd.testing.assert_series_equal(y, pd.Series(["a", "b", "a"], dtype="object"))
+
+    y = _convert_woodwork_types_wrapper(pd.array(["a", "b", "a"], dtype="string"))
+    pd.testing.assert_series_equal(y, pd.Series(["a", "b", "a"], dtype="object"))
+
+    y = _convert_woodwork_types_wrapper(pd.Series([True, False, True], dtype="boolean"))
+    pd.testing.assert_series_equal(y, pd.Series([True, False, True], dtype="bool"))
+
+    y = _convert_woodwork_types_wrapper(pd.array([True, False, True], dtype="boolean"))
+    pd.testing.assert_series_equal(y, pd.Series([True, False, True], dtype="bool"))
+
+
+def test_convert_woodwork_types_wrapper_dataframe():
+    X = pd.DataFrame({"Int series": pd.Series([1, 2, 3], dtype="Int64"),
+                      "Int array": pd.array([1, 2, 3], dtype="Int64"),
+                      "Int series with nan": pd.Series([1, 2, None], dtype="Int64"),
+                      "Int array with nan": pd.array([1, 2, None], dtype="Int64"),
+                      "string series": pd.Series(["a", "b", "a"], dtype="string"),
+                      "string array": pd.array(["a", "b", "a"], dtype="string"),
+                      "string series with nan": pd.Series(["a", "b", None], dtype="string"),
+                      "string array with nan": pd.array(["a", "b", None], dtype="string"),
+                      "boolean series": pd.Series([True, False, True], dtype="boolean"),
+                      "boolean array": pd.array([True, False, True], dtype="boolean"),
+                      "boolean series with nan": pd.Series([True, False, None], dtype="boolean"),
+                      "boolean array with nan": pd.array([True, False, None], dtype="boolean")
+                      })
+    X_expected = pd.DataFrame({"Int series": pd.Series([1, 2, 3], dtype="int64"),
+                               "Int array": pd.array([1, 2, 3], dtype="int64"),
+                               "Int series with nan": pd.Series([1, 2, np.nan], dtype="float64"),
+                               "Int array with nan": pd.array([1, 2, np.nan], dtype="float64"),
+                               "string series": pd.Series(["a", "b", "a"], dtype="object"),
+                               "string array": pd.array(["a", "b", "a"], dtype="object"),
+                               "string series with nan": pd.Series(["a", "b", np.nan], dtype="object"),
+                               "string array with nan": pd.array(["a", "b", np.nan], dtype="object"),
+                               "boolean series": pd.Series([True, False, True], dtype="bool"),
+                               "boolean array": pd.array([True, False, True], dtype="bool"),
+                               "boolean series with nan": pd.Series([True, False, np.nan], dtype="object"),
+                               "boolean array with nan": pd.array([True, False, np.nan], dtype="object")
+                               })
+    pd.testing.assert_frame_equal(X_expected, _convert_woodwork_types_wrapper(X))
+
+
+def test_convert_to_woodwork_structure():
+    X_dt = ww.DataTable(pd.DataFrame([[1, 2], [3, 4]]))
+    pd.testing.assert_frame_equal(X_dt.to_dataframe(), _convert_to_woodwork_structure(X_dt).to_dataframe())
+
+    X_dc = ww.DataColumn(pd.Series([1, 2, 3, 4]))
+    pd.testing.assert_series_equal(X_dc.to_series(), _convert_to_woodwork_structure(X_dc).to_series())
+
+    X_pd = pd.DataFrame({0: pd.Series([1, 2], dtype="Int64"),
+                         1: pd.Series([3, 4], dtype="Int64")})
+    pd.testing.assert_frame_equal(X_pd, _convert_to_woodwork_structure(X_pd).to_dataframe())
+
+    X_pd = pd.Series([1, 2, 3, 4], dtype="Int64")
+    pd.testing.assert_series_equal(X_pd, _convert_to_woodwork_structure(X_pd).to_series())
+
+    X_list = [1, 2, 3, 4]
+    X_expected = ww.DataColumn(pd.Series(X_list))
+    pd.testing.assert_series_equal(X_expected.to_series(), _convert_to_woodwork_structure(X_list).to_series())
+    assert X_list == [1, 2, 3, 4]
+
+    X_np = np.array([1, 2, 3, 4])
+    X_expected = ww.DataColumn(pd.Series(X_np))
+    pd.testing.assert_series_equal(X_expected.to_series(), _convert_to_woodwork_structure(X_np).to_series())
+    assert np.array_equal(X_np, np.array([1, 2, 3, 4]))
+
+    X_np = np.array([[1, 2], [3, 4]])
+    X_expected = ww.DataTable(pd.DataFrame(X_np))
+    pd.testing.assert_frame_equal(X_expected.to_dataframe(), _convert_to_woodwork_structure(X_np).to_dataframe())
+    assert np.array_equal(X_np, np.array([[1, 2], [3, 4]]))
diff --git a/evalml/utils/__init__.py b/evalml/utils/__init__.py
index b76533e8f6..36bdba3281 100644
--- a/evalml/utils/__init__.py
+++ b/evalml/utils/__init__.py
@@ -10,6 +10,7 @@
     jupyter_check,
     safe_repr,
     _convert_woodwork_types_wrapper,
+    _convert_to_woodwork_structure,
     drop_rows_with_nans,
     pad_with_nans
 )
diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py
index 92b87516f9..d04b1c07c1 100644
--- a/evalml/utils/gen_utils.py
+++ b/evalml/utils/gen_utils.py
@@ -210,7 +210,7 @@ def get_importable_subclasses(base_class, used_in_automl=True):
 
 
 def _rename_column_names_to_numeric(X):
-    """Used in XGBoost classifier and regressor classes to rename column names
+    """Used in LightGBM classifier class and XGBoost classifier and regressor classes to rename column names
         when the input is a pd.DataFrame in case it has column names that contain symbols ([, ], <) that XGBoost cannot natively handle.
 
     Arguments:
@@ -219,8 +219,18 @@ def _rename_column_names_to_numeric(X):
     Returns:
         Transformed X where column names are renamed to numerical values
     """
-    name_to_col_num = dict((col, col_num) for col_num, col in enumerate(X.columns.values))
-    return X.rename(columns=name_to_col_num, inplace=False)
+    X_t = X
+    if isinstance(X, np.ndarray):
+        return pd.DataFrame(X)
+    if isinstance(X, ww.DataTable):
+        X_t = X.to_dataframe()
+        logical_types = X.logical_types
+    name_to_col_num = dict((col, col_num) for col_num, col in enumerate(list(X.columns)))
+    X_renamed = X_t.rename(columns=name_to_col_num, inplace=False)
+    if isinstance(X, ww.DataTable):
+        renamed_logical_types = dict((name_to_col_num[col], logical_types[col]) for col in logical_types)
+        return ww.DataTable(X_renamed, logical_types=renamed_logical_types)
+    return X_renamed
 
 
 def jupyter_check():
@@ -277,19 +287,22 @@ def _convert_to_woodwork_structure(data):
     """
     Takes input data structure, and if it is not a Woodwork data structure already, will convert it to a Woodwork DataTable or DataColumn structure.
     """
+    ww_data = data
     if isinstance(data, ww.DataTable) or isinstance(data, ww.DataColumn):
-        return data
+        return ww_data
     # Convert numpy data structures to pandas data structures
     if isinstance(data, list):
-        data = np.array(data)
-    if isinstance(data, pd.api.extensions.ExtensionArray) or (isinstance(data, np.ndarray) and len(data.shape) == 1):
-        data = pd.Series(data)
-    elif isinstance(data, np.ndarray):
-        data = pd.DataFrame(data)
+        ww_data = np.array(data)
+
+    if isinstance(ww_data, pd.api.extensions.ExtensionArray) or (isinstance(ww_data, np.ndarray) and len(ww_data.shape) == 1):
+        ww_data = pd.Series(ww_data)
+    elif isinstance(ww_data, np.ndarray):
+        ww_data = pd.DataFrame(ww_data)
+
     # Convert pandas data structures to Woodwork data structures
-    if isinstance(data, pd.Series):
-        return ww.DataColumn(data)
-    return ww.DataTable(data)
+    if isinstance(ww_data, pd.Series):
+        return ww.DataColumn(ww_data)
+    return ww.DataTable(ww_data, copy_dataframe=True)
 
 
 def _convert_woodwork_types_wrapper(pd_data):
@@ -300,18 +313,30 @@ def _convert_woodwork_types_wrapper(pd_data):
         pd_data (pd.Series, pd.DataFrame, pd.ExtensionArray): Pandas data structure
 
     Returns:
-        New pandas data structure (pd.DataFrame or pd.Series) with original data and dtypes that can be handled by numpy
+        Modified pandas data structure (pd.DataFrame or pd.Series) with original data and dtypes that can be handled by numpy
     """
     nullable_to_numpy_mapping = {pd.Int64Dtype: 'int64',
                                  pd.BooleanDtype: 'bool',
                                  pd.StringDtype: 'object'}
-
-    if isinstance(pd_data, pd.Series) and type(pd_data.dtype) in nullable_to_numpy_mapping:
-        return pd_data.astype(nullable_to_numpy_mapping[type(pd_data.dtype)])
+    nullable_to_numpy_mapping_nan = {pd.Int64Dtype: 'float64',
+                                     pd.BooleanDtype: 'object',
+                                     pd.StringDtype: 'object'}
+
+    if isinstance(pd_data, pd.api.extensions.ExtensionArray) and type(pd_data.dtype) in nullable_to_numpy_mapping:  # unmapped extension dtypes fall through unchanged, like Series
+        if pd.isna(pd_data).any():
+            return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping_nan[type(pd_data.dtype)])
+        return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping[type(pd_data.dtype)])
+    if (isinstance(pd_data, pd.Series) and type(pd_data.dtype) in nullable_to_numpy_mapping):
+        if pd.isna(pd_data).any():
+            return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping_nan[type(pd_data.dtype)], index=pd_data.index)
+        return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping[type(pd_data.dtype)], index=pd_data.index)
     if isinstance(pd_data, pd.DataFrame):
         for col_name, col in pd_data.iteritems():
             if type(col.dtype) in nullable_to_numpy_mapping:
-                pd_data[col_name] = pd_data[col_name].astype(nullable_to_numpy_mapping[type(col.dtype)])
+                if pd.isna(pd_data[col_name]).any():
+                    pd_data[col_name] = pd.Series(pd_data[col_name].to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping_nan[type(pd_data[col_name].dtype)], index=pd_data.index)  # keep the frame's index so column assignment aligns row-for-row
+                else:
+                    pd_data[col_name] = pd_data[col_name].astype(nullable_to_numpy_mapping[type(col.dtype)])
     return pd_data