New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update DateTimeFormatDataCheck with actions and make pipeline from actions #3454
Changes from 8 commits
a40a9d8
81bb862
d84e8ac
6f30b96
3ab5932
03b1ad6
f687fd5
1d4d00b
37d08c5
0eb1df0
dbdecc4
599f2d6
132863f
a61abd4
3bfc669
9b17265
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,14 @@ | |
import pandas as pd | ||
from woodwork.statistics_utils import infer_frequency | ||
|
||
from evalml.data_checks import DataCheck, DataCheckError, DataCheckMessageCode | ||
from evalml.data_checks import ( | ||
DataCheck, | ||
DataCheckActionCode, | ||
DataCheckActionOption, | ||
DataCheckError, | ||
DataCheckMessageCode, | ||
DCAOParameterType, | ||
) | ||
from evalml.utils import infer_feature_types | ||
|
||
|
||
|
@@ -43,7 +50,7 @@ def validate(self, X, y): | |
... "message": "No frequency could be detected in column 'dates', possibly due to uneven intervals.", | ||
... "data_check_name": "DateTimeFormatDataCheck", | ||
... "level": "error", | ||
... "code": "DATETIME_HAS_UNEVEN_INTERVALS", | ||
... "code": "DATETIME_NO_FREQUENCY_INFERRED", | ||
... "details": {"columns": None, "rows": None}, | ||
... "action_options": [] | ||
... } | ||
|
@@ -53,6 +60,7 @@ def validate(self, X, y): | |
|
||
>>> X = pd.DataFrame(pd.date_range("2021-01-01", periods=9).append(pd.date_range("2021-01-31", periods=50)), columns=["dates"]) | ||
>>> y = pd.Series([0, 1, 0, 1, 1, 0, 0, 0, 1, 0]) | ||
>>> ww_payload = infer_frequency(X["dates"], debug=True, window_length=5, threshold=0.8) | ||
>>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="dates") | ||
>>> assert datetime_format_dc.validate(X, y) == [ | ||
... { | ||
|
@@ -62,13 +70,44 @@ def validate(self, X, y): | |
... "code": "DATETIME_IS_MISSING_VALUES", | ||
... "details": {"columns": None, "rows": None}, | ||
... "action_options": [] | ||
... } | ||
... }, | ||
... { | ||
... "message": "A frequency was detected in column 'dates', but there are faulty datetime values that need to be addressed.", | ||
... "data_check_name": "DateTimeFormatDataCheck", | ||
... "level": "error", | ||
... "code": "DATETIME_HAS_UNEVEN_INTERVALS", | ||
... "details": {'columns': None, 'rows': None}, | ||
... "action_options": [ | ||
... { | ||
... 'code': 'REGULARIZE_AND_IMPUTE_DATASET', | ||
... 'data_check_name': 'DateTimeFormatDataCheck', | ||
... 'metadata': { | ||
... 'columns': None, | ||
... 'is_target': True, | ||
... 'rows': None | ||
... }, | ||
... 'parameters': { | ||
... 'time_index': { | ||
... 'default_value': 'dates', | ||
... 'parameter_type': 'global', | ||
... 'type': 'str' | ||
... }, | ||
... 'frequency_payload': { | ||
... 'default_value': ww_payload, | ||
... 'parameter_type': 'global', | ||
... 'type': 'tuple' | ||
... } | ||
... } | ||
... } | ||
... ] | ||
... } | ||
... ] | ||
|
||
The column "dates" has a repeat of the date 2021-01-09 appended to the end, which is considered redundant and will raise an error. | ||
|
||
>>> X = pd.DataFrame(pd.date_range("2021-01-01", periods=9).append(pd.date_range("2021-01-09", periods=1)), columns=["dates"]) | ||
>>> y = pd.Series([0, 1, 0, 1, 1, 0, 0, 0, 1, 0]) | ||
>>> ww_payload = infer_frequency(X["dates"], debug=True, window_length=5, threshold=0.8) | ||
>>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="dates") | ||
>>> assert datetime_format_dc.validate(X, y) == [ | ||
... { | ||
|
@@ -78,12 +117,43 @@ def validate(self, X, y): | |
... "code": "DATETIME_HAS_REDUNDANT_ROW", | ||
... "details": {"columns": None, "rows": None}, | ||
... "action_options": [] | ||
... } | ||
... }, | ||
... { | ||
... "message": "A frequency was detected in column 'dates', but there are faulty datetime values that need to be addressed.", | ||
... "data_check_name": "DateTimeFormatDataCheck", | ||
... "level": "error", | ||
... "code": "DATETIME_HAS_UNEVEN_INTERVALS", | ||
... "details": {'columns': None, 'rows': None}, | ||
... "action_options": [ | ||
... { | ||
... 'code': 'REGULARIZE_AND_IMPUTE_DATASET', | ||
... 'data_check_name': 'DateTimeFormatDataCheck', | ||
... 'metadata': { | ||
... 'columns': None, | ||
... 'is_target': True, | ||
... 'rows': None | ||
... }, | ||
... 'parameters': { | ||
... 'time_index': { | ||
... 'default_value': 'dates', | ||
... 'parameter_type': 'global', | ||
... 'type': 'str' | ||
... }, | ||
... 'frequency_payload': { | ||
... 'default_value': ww_payload, | ||
... 'parameter_type': 'global', | ||
... 'type': 'tuple' | ||
... } | ||
... } | ||
... } | ||
... ] | ||
... } | ||
... ] | ||
|
||
The column "Weeks" has a date that does not follow the weekly pattern, which is considered misaligned. | ||
|
||
>>> X = pd.DataFrame(pd.date_range("2021-01-01", freq="W", periods=12).append(pd.date_range("2021-03-22", periods=1)), columns=["Weeks"]) | ||
>>> ww_payload = infer_frequency(X["Weeks"], debug=True, window_length=5, threshold=0.8) | ||
>>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="Weeks") | ||
>>> assert datetime_format_dc.validate(X, y) == [ | ||
... { | ||
|
@@ -93,7 +163,83 @@ def validate(self, X, y): | |
... "details": {"columns": None, "rows": None}, | ||
... "code": "DATETIME_HAS_MISALIGNED_VALUES", | ||
... "action_options": [] | ||
... } | ||
... }, | ||
... { | ||
... "message": "A frequency was detected in column 'Weeks', but there are faulty datetime values that need to be addressed.", | ||
... "data_check_name": "DateTimeFormatDataCheck", | ||
... "level": "error", | ||
... "code": "DATETIME_HAS_UNEVEN_INTERVALS", | ||
... "details": {'columns': None, 'rows': None}, | ||
... "action_options": [ | ||
... { | ||
... 'code': 'REGULARIZE_AND_IMPUTE_DATASET', | ||
... 'data_check_name': 'DateTimeFormatDataCheck', | ||
... 'metadata': { | ||
... 'columns': None, | ||
... 'is_target': True, | ||
... 'rows': None | ||
... }, | ||
... 'parameters': { | ||
... 'time_index': { | ||
... 'default_value': 'Weeks', | ||
... 'parameter_type': 'global', | ||
... 'type': 'str' | ||
... }, | ||
... 'frequency_payload': { | ||
... 'default_value': ww_payload, | ||
... 'parameter_type': 'global', | ||
... 'type': 'tuple' | ||
... } | ||
... } | ||
... } | ||
... ] | ||
... } | ||
... ] | ||
|
||
The column "Weeks" has a date that does not follow the weekly pattern, which is considered misaligned. | ||
|
||
>>> X = pd.DataFrame(pd.date_range("2021-01-01", freq="W", periods=12).append(pd.date_range("2021-03-22", periods=1)), columns=["Weeks"]) | ||
>>> ww_payload = infer_frequency(X["Weeks"], debug=True, window_length=5, threshold=0.8) | ||
>>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="Weeks") | ||
>>> assert datetime_format_dc.validate(X, y) == [ | ||
... { | ||
... "message": "Column 'Weeks' has datetime values that do not align with the inferred frequency.", | ||
... "data_check_name": "DateTimeFormatDataCheck", | ||
... "level": "error", | ||
... "details": {"columns": None, "rows": None}, | ||
... "code": "DATETIME_HAS_MISALIGNED_VALUES", | ||
... "action_options": [] | ||
... }, | ||
... { | ||
... "message": "A frequency was detected in column 'Weeks', but there are faulty datetime values that need to be addressed.", | ||
... "data_check_name": "DateTimeFormatDataCheck", | ||
... "level": "error", | ||
... "code": "DATETIME_HAS_UNEVEN_INTERVALS", | ||
... "details": {'columns': None, 'rows': None}, | ||
... "action_options": [ | ||
... { | ||
... 'code': 'REGULARIZE_AND_IMPUTE_DATASET', | ||
... 'data_check_name': 'DateTimeFormatDataCheck', | ||
... 'metadata': { | ||
... 'columns': None, | ||
... 'is_target': True, | ||
... 'rows': None | ||
... }, | ||
... 'parameters': { | ||
... 'time_index': { | ||
... 'default_value': 'Weeks', | ||
... 'parameter_type': 'global', | ||
... 'type': 'str' | ||
... }, | ||
... 'frequency_payload': { | ||
... 'default_value': ww_payload, | ||
... 'parameter_type': 'global', | ||
... 'type': 'tuple' | ||
... } | ||
... } | ||
... } | ||
... ] | ||
... } | ||
... ] | ||
|
||
The column "Weeks" passed integers instead of datetime data, which will raise an error. | ||
|
@@ -154,6 +300,7 @@ def validate(self, X, y): | |
... ["2-12-21", "3-12-21"]] | ||
>>> dates[0][0] = None | ||
>>> df = pd.DataFrame(dates, columns=["days", "days2"]) | ||
>>> ww_payload = infer_frequency(pd.to_datetime(df["days"]), debug=True, window_length=5, threshold=0.8) | ||
>>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="days") | ||
>>> assert datetime_format_dc.validate(df, y) == [ | ||
... { | ||
|
@@ -163,6 +310,36 @@ def validate(self, X, y): | |
... "details": {"columns": None, "rows": None}, | ||
... "code": "DATETIME_HAS_NAN", | ||
... "action_options": [] | ||
... }, | ||
... { | ||
... "message": "A frequency was detected in column 'days', but there are faulty datetime values that need to be addressed.", | ||
... "data_check_name": "DateTimeFormatDataCheck", | ||
... "level": "error", | ||
... "code": "DATETIME_HAS_UNEVEN_INTERVALS", | ||
... "details": {'columns': None, 'rows': None}, | ||
... "action_options": [ | ||
... { | ||
... 'code': 'REGULARIZE_AND_IMPUTE_DATASET', | ||
... 'data_check_name': 'DateTimeFormatDataCheck', | ||
... 'metadata': { | ||
... 'columns': None, | ||
... 'is_target': True, | ||
... 'rows': None | ||
... }, | ||
... 'parameters': { | ||
... 'time_index': { | ||
... 'default_value': 'days', | ||
... 'parameter_type': 'global', | ||
... 'type': 'str' | ||
... }, | ||
... 'frequency_payload': { | ||
... 'default_value': ww_payload, | ||
... 'parameter_type': 'global', | ||
... 'type': 'tuple' | ||
... } | ||
... } | ||
... } | ||
... ] | ||
... } | ||
... ] | ||
... | ||
|
@@ -270,7 +447,34 @@ def validate(self, X, y): | |
DataCheckError( | ||
message=f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals.", | ||
data_check_name=self.name, | ||
message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, | ||
).to_dict() | ||
) | ||
else: | ||
messages.append( | ||
DataCheckError( | ||
message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed.", | ||
data_check_name=self.name, | ||
message_code=DataCheckMessageCode.DATETIME_HAS_UNEVEN_INTERVALS, | ||
action_options=[ | ||
DataCheckActionOption( | ||
DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET, | ||
data_check_name=self.name, | ||
parameters={ | ||
"time_index": { | ||
"parameter_type": DCAOParameterType.GLOBAL, | ||
"type": "str", | ||
"default_value": col_name, | ||
}, | ||
"frequency_payload": { | ||
"parameter_type": DCAOParameterType.GLOBAL, | ||
"type": "tuple", | ||
"default_value": ww_payload, | ||
}, | ||
}, | ||
metadata={"is_target": True}, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We're not using is_target anywhere, right? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. An EvalML consumer might check for […]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Happy to keep it! Just wondering why, since I didn't see it being "used". |
||
) | ||
], | ||
).to_dict() | ||
) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We're adding this new error instead of adding it to every one of the already existing data check errors to avoid having duplicate data check actions, right?
I think this may be confusing UX for users, because they'll see multiple errors but only the "DATETIME_HAS_UNEVEN_INTERVALS" error will appear "fixable" via an action, even though this action will fix all the other errors as well.
This may be the best we can do for now. Tagging @Cmancuso so we can discuss further.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ParthivNaresh and I talked about this - errors will be consolidated in the future.