
Update DateTimeFormatDataCheck with actions and make pipeline from actions #3454

Merged
merged 16 commits on Apr 14, 2022
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -3,6 +3,7 @@

**Future Releases**
* Enhancements
* Updated ``make_pipeline_from_data_check_output`` to work with time series problems. :pr:`3454`
* Added ``TimeSeriesImputer`` component :pr:`3374`
* Replaced ``pipeline_parameters`` and ``custom_hyperparameters`` with ``search_parameters`` in ``AutoMLSearch`` :pr:`3373`, :pr:`3427`
* Added ``TimeSeriesRegularizer`` to smooth uninferrable date ranges for time series problems :pr:`3376`
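
For context, a minimal sketch of the flow this release note describes: run the data check, then hand its output to make_pipeline_from_data_check_output. The import path, signature, and the problem_configuration keyword are assumed from this version of evalml; treat the sketch as illustrative, not definitive.

    import pandas as pd
    from evalml.data_checks import DateTimeFormatDataCheck
    from evalml.pipelines.utils import make_pipeline_from_data_check_output

    # Uneven dates: a daily run of 9 days followed by a jump to 2021-01-31.
    X = pd.DataFrame(
        pd.date_range("2021-01-01", periods=9).append(pd.date_range("2021-01-31", periods=50)),
        columns=["dates"],
    )
    y = pd.Series(range(len(X)))

    # The data check output now carries REGULARIZE_AND_IMPUTE_DATASET action options.
    results = DateTimeFormatDataCheck(datetime_column="dates").validate(X, y)

    # Build a pipeline (e.g. TimeSeriesRegularizer + TimeSeriesImputer) from those actions.
    # problem_configuration values are illustrative; a time series problem type likely needs them.
    pipeline = make_pipeline_from_data_check_output(
        "time series regression",
        results,
        problem_configuration={"time_index": "dates", "gap": 1, "forecast_horizon": 1, "max_delay": 1},
    )
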
4 changes: 4 additions & 0 deletions evalml/data_checks/data_check_action_code.py
@@ -19,6 +19,9 @@ class DataCheckActionCode(Enum):
TRANSFORM_TARGET = "transform_target"
"""Action code for transforming the target data."""

REGULARIZE_AND_IMPUTE_DATASET = "regularize_and_impute_dataset"
"""Action code for regularizing and imputing all features and target time series data."""

@classproperty
def _all_values(cls):
return {code.value.upper(): code for code in list(cls)}
@@ -30,5 +33,6 @@ def __str__(self):
DataCheckActionCode.DROP_ROWS.name: "drop_rows",
DataCheckActionCode.IMPUTE_COL.name: "impute_col",
DataCheckActionCode.TRANSFORM_TARGET.name: "transform_target",
DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET.name: "regularize_and_impute_dataset",
}
return datacheck_action_code_dict[self.name]
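
A quick check of the new member's string form, per the mapping above (runnable against this branch):

    from evalml.data_checks import DataCheckActionCode

    # The __str__ mapping above resolves the new member to its snake_case value.
    assert str(DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET) == "regularize_and_impute_dataset"
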
3 changes: 3 additions & 0 deletions evalml/data_checks/data_check_message_code.py
@@ -109,6 +109,9 @@ class DataCheckMessageCode(Enum):
DATETIME_INFORMATION_NOT_FOUND = "datetime_information_not_found"
"""Message code for when datetime information can not be found or is in an unaccepted format."""

DATETIME_NO_FREQUENCY_INFERRED = "datetime_no_frequency_inferred"
"""Message code for when no frequency can be inferred in the datetime values."""

DATETIME_HAS_UNEVEN_INTERVALS = "datetime_has_uneven_intervals"
"""Message code for when the datetime values have uneven intervals."""

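
Since validate() returns plain dicts (see the doctests below), a consumer can match the new message code by its string value. A minimal sketch with stand-in results:

    # Stand-in validation output; real result dicts carry more keys (see the doctests below).
    results = [
        {"code": "DATETIME_NO_FREQUENCY_INFERRED", "level": "error"},
        {"code": "DATETIME_HAS_NAN", "level": "error"},
    ]
    no_freq = [r for r in results if r["code"] == "DATETIME_NO_FREQUENCY_INFERRED"]
    assert len(no_freq) == 1
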
214 changes: 209 additions & 5 deletions evalml/data_checks/datetime_format_data_check.py
@@ -2,7 +2,14 @@
import pandas as pd
from woodwork.statistics_utils import infer_frequency

-from evalml.data_checks import DataCheck, DataCheckError, DataCheckMessageCode
+from evalml.data_checks import (
+    DataCheck,
+    DataCheckActionCode,
+    DataCheckActionOption,
+    DataCheckError,
+    DataCheckMessageCode,
+    DCAOParameterType,
+)
from evalml.utils import infer_feature_types


@@ -43,7 +50,7 @@ def validate(self, X, y):
... "message": "No frequency could be detected in column 'dates', possibly due to uneven intervals.",
... "data_check_name": "DateTimeFormatDataCheck",
... "level": "error",
... "code": "DATETIME_HAS_UNEVEN_INTERVALS",
... "code": "DATETIME_NO_FREQUENCY_INFERRED",
... "details": {"columns": None, "rows": None},
... "action_options": []
... }
@@ -53,6 +60,7 @@

>>> X = pd.DataFrame(pd.date_range("2021-01-01", periods=9).append(pd.date_range("2021-01-31", periods=50)), columns=["dates"])
>>> y = pd.Series([0, 1, 0, 1, 1, 0, 0, 0, 1, 0])
>>> ww_payload = infer_frequency(X["dates"], debug=True, window_length=5, threshold=0.8)
>>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="dates")
>>> assert datetime_format_dc.validate(X, y) == [
... {
@@ -62,13 +70,44 @@ def validate(self, X, y):
... "code": "DATETIME_IS_MISSING_VALUES",
... "details": {"columns": None, "rows": None},
... "action_options": []
... }
... },
... {
... "message": "A frequency was detected in column 'dates', but there are faulty datetime values that need to be addressed.",
... "data_check_name": "DateTimeFormatDataCheck",
... "level": "error",
... "code": "DATETIME_HAS_UNEVEN_INTERVALS",
... "details": {'columns': None, 'rows': None},
... "action_options": [
... {
... 'code': 'REGULARIZE_AND_IMPUTE_DATASET',
... 'data_check_name': 'DateTimeFormatDataCheck',
... 'metadata': {
... 'columns': None,
... 'is_target': True,
... 'rows': None
... },
... 'parameters': {
... 'time_index': {
... 'default_value': 'dates',
... 'parameter_type': 'global',
... 'type': 'str'
... },
... 'frequency_payload': {
... 'default_value': ww_payload,
... 'parameter_type': 'global',
... 'type': 'tuple'
... }
... }
... }
... ]
... }
... ]

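The ww_payload used above comes straight from woodwork: with debug=True, infer_frequency returns the inferred frequency together with a debug object describing the faulty values, and that whole return value is what gets stored as the action option's frequency_payload (hence "type": "tuple"). A rough sketch, with the return shape assumed from woodwork's API as used in this diff:

    import pandas as pd
    from woodwork.statistics_utils import infer_frequency

    dates = pd.Series(pd.date_range("2021-01-01", periods=9).append(pd.date_range("2021-01-31", periods=50)))
    # debug=True returns a tuple rather than just a frequency string; the whole
    # tuple is forwarded as the frequency_payload default value in the action option.
    ww_payload = infer_frequency(dates, debug=True, window_length=5, threshold=0.8)
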
The column "dates" has a repeat of the date 2021-01-09 appended to the end, which is considered redundant and will raise an error.

>>> X = pd.DataFrame(pd.date_range("2021-01-01", periods=9).append(pd.date_range("2021-01-09", periods=1)), columns=["dates"])
>>> y = pd.Series([0, 1, 0, 1, 1, 0, 0, 0, 1, 0])
>>> ww_payload = infer_frequency(X["dates"], debug=True, window_length=5, threshold=0.8)
>>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="dates")
>>> assert datetime_format_dc.validate(X, y) == [
... {
@@ -78,12 +117,43 @@ def validate(self, X, y):
... "code": "DATETIME_HAS_REDUNDANT_ROW",
... "details": {"columns": None, "rows": None},
... "action_options": []
... }
... },
... {
... "message": "A frequency was detected in column 'dates', but there are faulty datetime values that need to be addressed.",
... "data_check_name": "DateTimeFormatDataCheck",
... "level": "error",
... "code": "DATETIME_HAS_UNEVEN_INTERVALS",
... "details": {'columns': None, 'rows': None},
... "action_options": [
... {
... 'code': 'REGULARIZE_AND_IMPUTE_DATASET',
... 'data_check_name': 'DateTimeFormatDataCheck',
... 'metadata': {
... 'columns': None,
... 'is_target': True,
... 'rows': None
... },
... 'parameters': {
... 'time_index': {
... 'default_value': 'dates',
... 'parameter_type': 'global',
... 'type': 'str'
... },
... 'frequency_payload': {
... 'default_value': ww_payload,
... 'parameter_type': 'global',
... 'type': 'tuple'
... }
... }
... }
... ]
... }
... ]

The column "Weeks" has a date that does not follow the weekly pattern, which is considered misaligned.

>>> X = pd.DataFrame(pd.date_range("2021-01-01", freq="W", periods=12).append(pd.date_range("2021-03-22", periods=1)), columns=["Weeks"])
>>> ww_payload = infer_frequency(X["Weeks"], debug=True, window_length=5, threshold=0.8)
>>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="Weeks")
>>> assert datetime_format_dc.validate(X, y) == [
... {
Expand All @@ -93,7 +163,83 @@ def validate(self, X, y):
... "details": {"columns": None, "rows": None},
... "code": "DATETIME_HAS_MISALIGNED_VALUES",
... "action_options": []
... }
... },
... {
... "message": "A frequency was detected in column 'Weeks', but there are faulty datetime values that need to be addressed.",
... "data_check_name": "DateTimeFormatDataCheck",
... "level": "error",
... "code": "DATETIME_HAS_UNEVEN_INTERVALS",
... "details": {'columns': None, 'rows': None},
... "action_options": [
... {
... 'code': 'REGULARIZE_AND_IMPUTE_DATASET',
... 'data_check_name': 'DateTimeFormatDataCheck',
... 'metadata': {
... 'columns': None,
... 'is_target': True,
... 'rows': None
... },
... 'parameters': {
... 'time_index': {
... 'default_value': 'Weeks',
... 'parameter_type': 'global',
... 'type': 'str'
... },
... 'frequency_payload': {
... 'default_value': ww_payload,
... 'parameter_type': 'global',
... 'type': 'tuple'
... }
... }
... }
... ]
... }
... ]

The column "Weeks" has a date that does not follow the weekly pattern, which is considered misaligned.

>>> X = pd.DataFrame(pd.date_range("2021-01-01", freq="W", periods=12).append(pd.date_range("2021-03-22", periods=1)), columns=["Weeks"])
>>> ww_payload = infer_frequency(X["Weeks"], debug=True, window_length=5, threshold=0.8)
>>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="Weeks")
>>> assert datetime_format_dc.validate(X, y) == [
... {
... "message": "Column 'Weeks' has datetime values that do not align with the inferred frequency.",
... "data_check_name": "DateTimeFormatDataCheck",
... "level": "error",
... "details": {"columns": None, "rows": None},
... "code": "DATETIME_HAS_MISALIGNED_VALUES",
... "action_options": []
... },
... {
... "message": "A frequency was detected in column 'Weeks', but there are faulty datetime values that need to be addressed.",
... "data_check_name": "DateTimeFormatDataCheck",
... "level": "error",
... "code": "DATETIME_HAS_UNEVEN_INTERVALS",
... "details": {'columns': None, 'rows': None},
... "action_options": [
... {
... 'code': 'REGULARIZE_AND_IMPUTE_DATASET',
... 'data_check_name': 'DateTimeFormatDataCheck',
... 'metadata': {
... 'columns': None,
... 'is_target': True,
... 'rows': None
... },
... 'parameters': {
... 'time_index': {
... 'default_value': 'Weeks',
... 'parameter_type': 'global',
... 'type': 'str'
... },
... 'frequency_payload': {
... 'default_value': ww_payload,
... 'parameter_type': 'global',
... 'type': 'tuple'
... }
... }
... }
... ]
... }
... ]

The column "Weeks" passed integers instead of datetime data, which will raise an error.
@@ -154,6 +300,7 @@ def validate(self, X, y):
... ["2-12-21", "3-12-21"]]
>>> dates[0][0] = None
>>> df = pd.DataFrame(dates, columns=["days", "days2"])
>>> ww_payload = infer_frequency(pd.to_datetime(df["days"]), debug=True, window_length=5, threshold=0.8)
>>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="days")
>>> assert datetime_format_dc.validate(df, y) == [
... {
@@ -163,6 +310,36 @@ def validate(self, X, y):
... "details": {"columns": None, "rows": None},
... "code": "DATETIME_HAS_NAN",
... "action_options": []
... },
... {
... "message": "A frequency was detected in column 'days', but there are faulty datetime values that need to be addressed.",
... "data_check_name": "DateTimeFormatDataCheck",
... "level": "error",
... "code": "DATETIME_HAS_UNEVEN_INTERVALS",
... "details": {'columns': None, 'rows': None},
... "action_options": [
... {
... 'code': 'REGULARIZE_AND_IMPUTE_DATASET',
... 'data_check_name': 'DateTimeFormatDataCheck',
... 'metadata': {
... 'columns': None,
... 'is_target': True,
... 'rows': None
... },
... 'parameters': {
... 'time_index': {
... 'default_value': 'days',
... 'parameter_type': 'global',
... 'type': 'str'
... },
... 'frequency_payload': {
... 'default_value': ww_payload,
... 'parameter_type': 'global',
... 'type': 'tuple'
... }
... }
... }
... ]
... }
... ]
...
@@ -270,7 +447,34 @@ def validate(self, X, y):
DataCheckError(
message=f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals.",
data_check_name=self.name,
message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED,
).to_dict()
)
else:
messages.append(
DataCheckError(

Contributor: We're adding this new error instead of adding it to every one of the already existing data check errors to avoid having duplicate data check actions, right?

I think this may be confusing UX for users, because they'll see multiple errors but only "DATETIME_HAS_UNEVEN_INTERVALS" will appear "fixable" via an action, even though this action will fix all the other errors.

This may be the best we can do for now. Tagging @Cmancuso so we can discuss further.

Contributor: @ParthivNaresh and I talked about this - errors will be consolidated in the future.

message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed.",
data_check_name=self.name,
message_code=DataCheckMessageCode.DATETIME_HAS_UNEVEN_INTERVALS,
action_options=[
DataCheckActionOption(
DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET,
data_check_name=self.name,
parameters={
"time_index": {
"parameter_type": DCAOParameterType.GLOBAL,
"type": "str",
"default_value": col_name,
},
"frequency_payload": {
"parameter_type": DCAOParameterType.GLOBAL,
"type": "tuple",
"default_value": ww_payload,
},
},
metadata={"is_target": True},

Contributor: We're not using is_target anywhere, right?

Contributor (Author): An EvalML consumer might check for is_target when running data check actions, to determine whether the target has been passed and to raise an error if it hasn't when the target is being modified. I felt like that case needed to be covered, but if it doesn't, I have no problem taking it out.

Contributor: Happy to keep it! Just wondering why, since I didn't see it being "used".

)
],
).to_dict()
)

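
To make the new action concrete, here is a hedged sketch of how a consumer could turn the option's parameters into the regularizer the action implies. build_regularizer_from_option is a hypothetical helper, and it assumes TimeSeriesRegularizer accepts time_index and frequency_payload as constructor arguments; nothing below is part of this diff:

    from evalml.pipelines.components import TimeSeriesRegularizer

    def build_regularizer_from_option(option_dict):
        # option_dict matches the "action_options" entries shown in the doctests above.
        params = option_dict["parameters"]
        # Assumption: TimeSeriesRegularizer takes these two parameters directly.
        return TimeSeriesRegularizer(
            time_index=params["time_index"]["default_value"],
            frequency_payload=params["frequency_payload"]["default_value"],
        )
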
4 changes: 2 additions & 2 deletions evalml/data_checks/default_data_checks.py
@@ -40,8 +40,8 @@ class DefaultDataChecks(DataChecks):
problem_type (str): The problem type that is being validated. Can be regression, binary, or multiclass.
objective (str or ObjectiveBase): Name or instance of the objective class.
n_splits (int): The number of splits as determined by the data splitter being used. Defaults to 3.
-datetime_column (str): The name of the column containing datetime information to be used for time series problems.
-    Default to "index" indicating that the datetime information is in the index of X or y.
+problem_configuration (dict): Required for time series problem types. Values should be passed in for time_index,
+    gap, forecast_horizon, and max_delay.
"""

_DEFAULT_DATA_CHECK_CLASSES = [
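
A sketch of constructing the default checks under the updated docstring; the objective and configuration values are illustrative only:

    from evalml.data_checks import DefaultDataChecks

    # problem_configuration replaces the old datetime_column argument for time series problems.
    data_checks = DefaultDataChecks(
        problem_type="time series regression",
        objective="MedianAE",
        n_splits=3,
        problem_configuration={
            "time_index": "dates",
            "gap": 1,
            "forecast_horizon": 1,
            "max_delay": 1,
        },
    )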