Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow users to set feature types without having to learn about woodwork directly #1555

Merged
merged 20 commits into from Dec 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/api_reference.rst
Expand Up @@ -538,4 +538,4 @@ General Utils
get_random_seed
pad_with_nans
drop_rows_with_nans

infer_feature_types
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Expand Up @@ -17,6 +17,7 @@ Release Notes
* Added more information to users about ensembling behavior in ``AutoMLSearch`` :pr:`1527`
* Add woodwork support for more utility and graph methods :pr:`1544`
* Changed ``DateTimeFeaturizer`` to encode features as int :pr:`1479`
* Added utility method so that users can set feature types without having to learn about Woodwork directly :pr:`1555`
* Added Linear Discriminant Analysis transformer for dimensionality reduction :pr:`1331`
* Added multiclass support for ``partial_dependence`` and ``graph_partial_dependence`` :pr:`1554`
* Added ``TimeSeriesBinaryClassificationPipeline`` and ``TimeSeriesMulticlassClassificationPipeline`` classes :pr:`1528`
Expand Down
17 changes: 13 additions & 4 deletions docs/source/user_guide/automl.ipynb
Expand Up @@ -47,7 +47,7 @@
"source": [
"__Note:__ To provide data to EvalML, it is recommended that you create a `DataTable` object using [the Woodwork project](https://woodwork.alteryx.com/en/stable/).\n",
"\n",
"If a pandas `DataFrame` is provided for the input features, EvalML will convert it to a Woodwork `DataTable` under the hood, running additional inference logic to detect the type of each feature, most notably detecting if a categorical feature should be treated as a text feature instead. If you'd like to override Woodwork's inference, providing a `DataTable` as input makes it easy to control how EvalML will treat each feature, as a numeric feature, a categorical feature, a text feature or other type of feature."
"EvalML also accepts ``pandas`` input, and will run type inference on top of the input ``pandas`` data. If you\u2019d like to change the types inferred by EvalML, you can use the `infer_feature_types` utility method as follows. The `infer_feature_types` utility method takes pandas or numpy input and converts it to a Woodwork data structure. It takes in a `feature_types` parameter which can be used to specify what types specific columns should be. In the example below, we specify that the provider, which would have otherwise been inferred as a column with natural language, is a categorical column."
]
},
{
Expand All @@ -57,8 +57,17 @@
"outputs": [],
"source": [
"import evalml\n",
"\n",
"X, y = evalml.demos.load_breast_cancer()\n",
"from evalml.utils import infer_feature_types\n",
"X, y = evalml.demos.load_fraud(return_pandas=True)\n",
"X = infer_feature_types(X, feature_types={'provider': 'categorical'})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl = evalml.automl.AutoMLSearch(X_train=X, y_train=y, problem_type='binary')\n",
"automl.search()"
]
Expand Down Expand Up @@ -381,4 +390,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
33 changes: 33 additions & 0 deletions evalml/tests/utils_tests/test_gen_utils.py
Expand Up @@ -22,6 +22,7 @@
get_random_seed,
get_random_state,
import_or_raise,
infer_feature_types,
jupyter_check,
pad_with_nans,
save_plot
Expand Down Expand Up @@ -398,6 +399,38 @@ def test_convert_to_woodwork_structure():
assert np.array_equal(X_np, np.array([[1, 2], [3, 4]]))


def test_infer_feature_types_dataframe():
    """Check that DataFrames round-trip through infer_feature_types, with and without type overrides."""
    # Default integer columns: values must match, but the dtype is allowed to differ.
    plain_frame = pd.DataFrame({0: pd.Series([1, 2]),
                                1: pd.Series([3, 4])})
    round_tripped = infer_feature_types(plain_frame).to_dataframe()
    pd.testing.assert_frame_equal(plain_frame, round_tripped, check_dtype=False)

    # Nullable-integer columns: the frame must come back unchanged, dtypes included.
    nullable_frame = pd.DataFrame({0: pd.Series([1, 2], dtype="Int64"),
                                   1: pd.Series([3, 4], dtype="Int64")})
    pd.testing.assert_frame_equal(nullable_frame, infer_feature_types(nullable_frame).to_dataframe())

    # Overriding column 0 should work whether the type is given by name or as a Woodwork class.
    categorical_frame = nullable_frame.copy()
    categorical_frame[0] = categorical_frame[0].astype("category")
    for logical_type in ("categorical", ww.logical_types.Categorical):
        overridden = infer_feature_types(nullable_frame, {0: logical_type}).to_dataframe()
        pd.testing.assert_frame_equal(categorical_frame, overridden)


def test_infer_feature_types_series():
    """Check that Series round-trip through infer_feature_types, with and without a type override."""
    # A default integer Series is expected to come back with the nullable "Int64" dtype.
    raw_series = pd.Series([1, 2, 3, 4])
    pd.testing.assert_series_equal(raw_series.astype("Int64"), infer_feature_types(raw_series).to_series())

    # A Series that is already "Int64" must come back unchanged.
    nullable_series = pd.Series([1, 2, 3, 4], dtype="Int64")
    pd.testing.assert_series_equal(nullable_series, infer_feature_types(nullable_series).to_series())

    # For 1D input the override is passed directly, either by name or as a Woodwork class.
    for logical_type in ("categorical", ww.logical_types.Categorical):
        source_series = pd.Series([1, 2, 3, 4], dtype="Int64")
        expected_series = source_series.astype("category")
        pd.testing.assert_series_equal(expected_series, infer_feature_types(source_series, logical_type).to_series())


@pytest.mark.parametrize("file_name,format,interactive",
[
('test_plot', 'png', False),
Expand Down
1 change: 1 addition & 0 deletions evalml/utils/__init__.py
Expand Up @@ -13,6 +13,7 @@
_convert_to_woodwork_structure,
drop_rows_with_nans,
pad_with_nans,
infer_feature_types,
_get_rows_without_nans
)
from .cli_utils import print_info, get_evalml_root, get_installed_packages, get_sys_info, print_sys_info, print_deps
22 changes: 22 additions & 0 deletions evalml/utils/gen_utils.py
Expand Up @@ -286,6 +286,28 @@ def is_all_numeric(df):
return True


def infer_feature_types(data, feature_types=None):
    """Convert pandas or numpy input to a Woodwork structure, applying any explicitly requested types.

    Any column whose type is not specified keeps the type inferred by Woodwork.

    Arguments:
        data (pandas or numpy data structure): Input data to convert to a Woodwork data structure.
        feature_types (string, ww.logical_type obj, dict, optional): If data is a 2D structure, feature_types must be a
            dictionary mapping column names to the type of data represented in the column. If data is a 1D structure,
            then feature_types must be a Woodwork logical type or a string representing a Woodwork logical type
            ("Double", "Integer", "Boolean", "Categorical", "Datetime", "NaturalLanguage"). Defaults to None, in which
            case no types are overridden.

    Returns:
        A Woodwork data structure where the data type of each column was either specified or inferred.
    """
    converted = _convert_to_woodwork_structure(data)

    # Nothing to override: hand back the converted structure as-is.
    if feature_types is None:
        return converted

    # A one-element shape is treated as 1D data, which takes a single logical type
    # rather than a per-column mapping.
    is_one_dimensional = len(converted.shape) == 1
    if is_one_dimensional:
        return converted.set_logical_type(feature_types)
    return converted.set_types(logical_types=feature_types)


def _convert_to_woodwork_structure(data):
"""
Takes input data structure, and if it is not a Woodwork data structure already, will convert it to a Woodwork DataTable or DataColumn structure.
Expand Down