-
Notifications
You must be signed in to change notification settings - Fork 83
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ReplaceNullableTypes Component #3090
Changes from all commits
9bfec0b
fab0eaa
7214263
e71ee23
d6ed0db
e8c2b80
ed0289c
cfbb0da
09d66e9
d771359
11feebf
d795360
388ba77
3b2c369
9064692
d1983a6
a9a42a5
6b4e906
749c862
03ccb38
a8b0446
5a8feff
f1d61cb
b2417ac
7906935
8b2cc0e
280531d
c5ceb6a
511903f
3f38c54
3567f83
38db0cb
1458f85
8ea2cce
168fa5c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,4 +30,5 @@ | |
EmailFeaturizer, | ||
URLFeaturizer, | ||
DropRowsTransformer, | ||
ReplaceNullableTypes, | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
"""Transformer to replace features with the new nullable dtypes with a dtype that is compatible in EvalML.""" | ||
from woodwork import init_series | ||
from woodwork.logical_types import BooleanNullable, IntegerNullable | ||
|
||
from evalml.pipelines.components.transformers import Transformer | ||
from evalml.utils import infer_feature_types | ||
|
||
|
||
class ReplaceNullableTypes(Transformer):
    """Transformer to replace features with the new nullable dtypes with a dtype that is compatible in EvalML.

    Columns whose Woodwork logical type is ``IntegerNullable`` or ``AgeNullable``
    are re-initialized as ``double``; ``BooleanNullable`` columns are
    re-initialized as ``categorical``. A target with an ``IntegerNullable`` or
    ``BooleanNullable`` logical type is converted the same way.

    Args:
        random_seed (int): Forwarded to the ``Transformer`` base class. Defaults to 0.
        **kwargs: Additional parameters, stored as the component parameters.
    """

    name = "Replace Nullable Types Transformer"
    hyperparameter_ranges = {}
    """{}"""
    modifies_target = True

    def __init__(self, random_seed=0, **kwargs):
        parameters = dict(kwargs)

        # State learned in fit() and consumed by transform().
        self._nullable_int_cols = []
        self._nullable_bool_cols = []
        # One of None, "nullable_int" or "nullable_bool".
        self._nullable_target = None
        super().__init__(
            parameters=parameters, component_obj=None, random_seed=random_seed
        )

    def fit(self, X, y=None):
        """Fits component to data.

        Records which feature columns (and whether the target) use a nullable
        logical type so that ``transform`` knows what to convert.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self
        """
        X_t = infer_feature_types(X, ignore_nullable_types=True)

        self._nullable_int_cols = list(
            X_t.ww.select(
                ["IntegerNullable", "AgeNullable"], return_schema=True
            ).columns
        )
        self._nullable_bool_cols = list(
            X_t.ww.select(["BooleanNullable"], return_schema=True).columns
        )

        # Always reset first: otherwise refitting on a target that is not
        # nullable would keep the flag left over from a previous fit.
        self._nullable_target = None
        if y is not None:
            y = infer_feature_types(y, ignore_nullable_types=True)
            if isinstance(y.ww.logical_type, IntegerNullable):
                self._nullable_target = "nullable_int"
            elif isinstance(y.ww.logical_type, BooleanNullable):
                self._nullable_target = "nullable_bool"
        return self

    def transform(self, X, y=None):
        """Transforms data by replacing columns that contain nullable types with the appropriate replacement type.

        "float64" for nullable integers and "category" for nullable booleans.

        Args:
            X (pd.DataFrame): Data to transform
            y (pd.Series, optional): Target data to transform

        Returns:
            tuple of pd.DataFrame, pd.Series: Transformed X and transformed y.
                The second element is None when no target is passed.
        """
        X_t = infer_feature_types(X, ignore_nullable_types=True)
        for col in self._nullable_int_cols:
            X_t.ww[col] = init_series(X_t[col], logical_type="double")
        for col in self._nullable_bool_cols:
            X_t.ww[col] = init_series(X_t[col], logical_type="categorical")

        if y is None:
            return X_t, None

        y_t = infer_feature_types(y, ignore_nullable_types=True)
        if self._nullable_target == "nullable_int":
            y_t = init_series(y_t, logical_type="double")
        elif self._nullable_target == "nullable_bool":
            y_t = init_series(y_t, logical_type="categorical")
        return X_t, y_t

    def fit_transform(self, X, y=None):
        """Substitutes non-nullable types for the new pandas nullable types in the data and target data.

        Args:
            X (pd.DataFrame): Input features.
            y (pd.Series, optional): Target data.

        Returns:
            tuple of pd.DataFrame, pd.Series: The input features and target data with the non-nullable types set.
        """
        X_ww = infer_feature_types(X, ignore_nullable_types=True)
        y_ww = (
            infer_feature_types(y, ignore_nullable_types=True)
            if y is not None
            else None
        )
        return self.fit(X_ww, y_ww).transform(X_ww, y_ww)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,6 +50,7 @@ | |
ProphetRegressor, | ||
RandomForestClassifier, | ||
RandomForestRegressor, | ||
ReplaceNullableTypes, | ||
RFClassifierSelectFromModel, | ||
RFRegressorSelectFromModel, | ||
SelectByType, | ||
|
@@ -831,7 +832,8 @@ def test_component_has_random_seed(): | |
assert "random_seed" in params | ||
|
||
|
||
def test_transformer_transform_output_type(X_y_binary): | ||
@pytest.mark.parametrize("component_class", _all_transformers()) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I modified this test to move the for-loop over transformers into pytest parameters, which makes it easier to see which transformer component failed when the test fails. |
||
def test_transformer_transform_output_type(component_class, X_y_binary): | ||
X_np, y_np = X_y_binary | ||
assert isinstance(X_np, np.ndarray) | ||
assert isinstance(y_np, np.ndarray) | ||
|
@@ -850,95 +852,91 @@ def test_transformer_transform_output_type(X_y_binary): | |
(X_df_with_col_names, y_series_with_name, X_df_with_col_names.columns), | ||
] | ||
|
||
for component_class in _all_transformers(): | ||
if component_class in [PolynomialDetrender, LogTransformer, LabelEncoder]: | ||
# Skipping because these tests are handled in their respective test files | ||
continue | ||
print("Testing transformer {}".format(component_class.name)) | ||
for X, y, X_cols_expected in datatype_combos: | ||
print( | ||
'Checking output of transform for transformer "{}" on X type {} cols {}, y type {} name {}'.format( | ||
component_class.name, | ||
type(X), | ||
X.columns if isinstance(X, pd.DataFrame) else None, | ||
type(y), | ||
y.name if isinstance(y, pd.Series) else None, | ||
) | ||
if component_class in [PolynomialDetrender, LogTransformer, LabelEncoder]: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The rest of this diff should all just be de-indentation... |
||
pytest.skip( | ||
"Skipping because these tests are handled in their respective test files" | ||
) | ||
print("Testing transformer {}".format(component_class.name)) | ||
for X, y, X_cols_expected in datatype_combos: | ||
print( | ||
'Checking output of transform for transformer "{}" on X type {} cols {}, y type {} name {}'.format( | ||
component_class.name, | ||
type(X), | ||
X.columns if isinstance(X, pd.DataFrame) else None, | ||
type(y), | ||
y.name if isinstance(y, pd.Series) else None, | ||
) | ||
) | ||
|
||
component = component_class() | ||
# SMOTE will throw an error if we pass a ratio lower than the current class balance | ||
if "Oversampler" == component_class.name: | ||
# we cover this case in test_oversamplers | ||
continue | ||
elif component_class == TimeSeriesFeaturizer: | ||
# covered in test_delayed_feature_transformer.py | ||
continue | ||
|
||
component.fit(X, y=y) | ||
transform_output = component.transform(X, y=y) | ||
|
||
if component.modifies_target: | ||
assert isinstance(transform_output[0], pd.DataFrame) | ||
assert isinstance(transform_output[1], pd.Series) | ||
else: | ||
assert isinstance(transform_output, pd.DataFrame) | ||
component = component_class() | ||
# SMOTE will throw an error if we pass a ratio lower than the current class balance | ||
if "Oversampler" == component_class.name: | ||
# we cover this case in test_oversamplers | ||
continue | ||
elif component_class == TimeSeriesFeaturizer: | ||
# covered in test_delayed_feature_transformer.py | ||
continue | ||
|
||
if isinstance(component, SelectColumns) or isinstance( | ||
component, SelectByType | ||
): | ||
assert transform_output.shape == (X.shape[0], 0) | ||
elif isinstance(component, RFRegressorSelectFromModel): | ||
assert transform_output.shape == (X.shape[0], 10) | ||
elif isinstance(component, RFClassifierSelectFromModel): | ||
assert transform_output.shape == (X.shape[0], 10) | ||
elif isinstance(component, PCA) or isinstance( | ||
component, LinearDiscriminantAnalysis | ||
): | ||
assert transform_output.shape[0] == X.shape[0] | ||
assert transform_output.shape[1] <= X.shape[1] | ||
elif isinstance(component, DFSTransformer): | ||
assert transform_output.shape[0] == X.shape[0] | ||
assert transform_output.shape[1] >= X.shape[1] | ||
elif component.modifies_target: | ||
assert transform_output[0].shape == X.shape | ||
assert transform_output[1].shape[0] == X.shape[0] | ||
assert len(transform_output[1].shape) == 1 | ||
else: | ||
assert transform_output.shape == X.shape | ||
assert list(transform_output.columns) == list(X_cols_expected) | ||
component.fit(X, y=y) | ||
transform_output = component.transform(X, y=y) | ||
|
||
transform_output = component.fit_transform(X, y=y) | ||
if component.modifies_target: | ||
assert isinstance(transform_output[0], pd.DataFrame) | ||
assert isinstance(transform_output[1], pd.Series) | ||
else: | ||
assert isinstance(transform_output, pd.DataFrame) | ||
if component.modifies_target: | ||
assert isinstance(transform_output[0], pd.DataFrame) | ||
assert isinstance(transform_output[1], pd.Series) | ||
else: | ||
assert isinstance(transform_output, pd.DataFrame) | ||
|
||
if isinstance(component, SelectColumns) or isinstance(component, SelectByType): | ||
assert transform_output.shape == (X.shape[0], 0) | ||
elif isinstance(component, RFRegressorSelectFromModel): | ||
assert transform_output.shape == (X.shape[0], 10) | ||
elif isinstance(component, RFClassifierSelectFromModel): | ||
assert transform_output.shape == (X.shape[0], 10) | ||
elif isinstance(component, PCA) or isinstance( | ||
component, LinearDiscriminantAnalysis | ||
): | ||
assert transform_output.shape[0] == X.shape[0] | ||
assert transform_output.shape[1] <= X.shape[1] | ||
elif isinstance(component, DFSTransformer): | ||
assert transform_output.shape[0] == X.shape[0] | ||
assert transform_output.shape[1] >= X.shape[1] | ||
elif component.modifies_target: | ||
assert transform_output[0].shape == X.shape | ||
assert transform_output[1].shape[0] == X.shape[0] | ||
assert len(transform_output[1].shape) == 1 | ||
else: | ||
assert transform_output.shape == X.shape | ||
assert list(transform_output.columns) == list(X_cols_expected) | ||
|
||
if isinstance(component, SelectColumns) or isinstance( | ||
component, SelectByType | ||
): | ||
assert transform_output.shape == (X.shape[0], 0) | ||
elif isinstance(component, RFRegressorSelectFromModel): | ||
assert transform_output.shape == (X.shape[0], 10) | ||
elif isinstance(component, RFClassifierSelectFromModel): | ||
assert transform_output.shape == (X.shape[0], 10) | ||
elif isinstance(component, PCA) or isinstance( | ||
component, LinearDiscriminantAnalysis | ||
): | ||
assert transform_output.shape[0] == X.shape[0] | ||
assert transform_output.shape[1] <= X.shape[1] | ||
elif isinstance(component, DFSTransformer): | ||
assert transform_output.shape[0] == X.shape[0] | ||
assert transform_output.shape[1] >= X.shape[1] | ||
elif component.modifies_target: | ||
assert transform_output[0].shape == X.shape | ||
assert transform_output[1].shape[0] == X.shape[0] | ||
assert len(transform_output[1].shape) == 1 | ||
transform_output = component.fit_transform(X, y=y) | ||
if component.modifies_target: | ||
assert isinstance(transform_output[0], pd.DataFrame) | ||
assert isinstance(transform_output[1], pd.Series) | ||
else: | ||
assert isinstance(transform_output, pd.DataFrame) | ||
|
||
if isinstance(component, SelectColumns) or isinstance(component, SelectByType): | ||
assert transform_output.shape == (X.shape[0], 0) | ||
elif isinstance(component, RFRegressorSelectFromModel): | ||
assert transform_output.shape == (X.shape[0], 10) | ||
elif isinstance(component, RFClassifierSelectFromModel): | ||
assert transform_output.shape == (X.shape[0], 10) | ||
elif isinstance(component, PCA) or isinstance( | ||
component, LinearDiscriminantAnalysis | ||
): | ||
assert transform_output.shape[0] == X.shape[0] | ||
assert transform_output.shape[1] <= X.shape[1] | ||
elif isinstance(component, DFSTransformer): | ||
assert transform_output.shape[0] == X.shape[0] | ||
assert transform_output.shape[1] >= X.shape[1] | ||
elif component.modifies_target: | ||
assert transform_output[0].shape == X.shape | ||
assert transform_output[1].shape[0] == X.shape[0] | ||
assert len(transform_output[1].shape) == 1 | ||
|
||
else: | ||
assert transform_output.shape == X.shape | ||
assert list(transform_output.columns) == list(X_cols_expected) | ||
else: | ||
assert transform_output.shape == X.shape | ||
assert list(transform_output.columns) == list(X_cols_expected) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
|
@@ -1662,7 +1660,8 @@ def test_component_modifies_feature_or_target(): | |
if ( | ||
issubclass(component_class, BaseSampler) | ||
or hasattr(component_class, "inverse_transform") | ||
or component_class in [TargetImputer, DropRowsTransformer] | ||
or component_class | ||
in [TargetImputer, DropRowsTransformer, ReplaceNullableTypes] | ||
): | ||
assert component_class.modifies_target | ||
else: | ||
|
@@ -1680,7 +1679,8 @@ def test_component_parameters_supported_by_list_API(): | |
if ( | ||
issubclass(component_class, BaseSampler) | ||
or hasattr(component_class, "inverse_transform") | ||
or component_class in [TargetImputer, DropRowsTransformer] | ||
or component_class | ||
in [TargetImputer, DropRowsTransformer, ReplaceNullableTypes] | ||
): | ||
assert not component_class._supported_by_list_API | ||
else: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A convenience to help people debug the minimum dependency checks we have in CI.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we use this in the CI job? Maybe in a follow-up PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is already done in the github workflows, specifically the min deps CI workflows. I just put it in here so people can build a test environment to run them. If there's a better way that people do it, I can take this out!