2070 user specified date feature #2217

Merged · 27 commits · May 4, 2021

Commits (27)
68d6483 · ParthivNaresh · Apr 29, 2021
73c18a9 · Remove timeseriessplit reordering and adjust tests for ARIMA exceptio… · ParthivNaresh · Apr 29, 2021
88f7b4b · remove sktime manual install · ParthivNaresh · Apr 29, 2021
2580bc3 · test coverage change · ParthivNaresh · Apr 29, 2021
daf68ef · remove statsmodel · ParthivNaresh · Apr 30, 2021
2643b2f · release notes update · ParthivNaresh · May 1, 2021
c451887 · dependency update · ParthivNaresh · May 1, 2021
6143949 · test fix · ParthivNaresh · May 1, 2021
17d7c38 · add statsmodels · ParthivNaresh · May 1, 2021
5b1bb8d · Trigger Build · ParthivNaresh · May 1, 2021
0bb7c85 · remove pmdarima · ParthivNaresh · May 1, 2021
8c48244 · Trigger Build · ParthivNaresh · May 1, 2021
d1fbf2d · reduce components to 48 · ParthivNaresh · May 3, 2021
2fa2ea4 · Merge branch 'main' into 2070-User-Specified-Date-Feature · ParthivNaresh · May 3, 2021
2315a10 · lint fix and add build conda package · ParthivNaresh · May 3, 2021
c5981cb · test fix and add pmdarima · ParthivNaresh · May 3, 2021
e36917f · Remove ARIMA · ParthivNaresh · May 3, 2021
35cd5bc · Merge branch 'main' into 2070-User-Specified-Date-Feature · ParthivNaresh · May 3, 2021
8e10f67 · add back arima · ParthivNaresh · May 3, 2021
1eb2527 · add get dates · ParthivNaresh · May 3, 2021
0d37034 · reverting to old arima implementation · ParthivNaresh · May 3, 2021
ed6ab6e · remove unused fixture · ParthivNaresh · May 3, 2021
d51ffe8 · api reference update · ParthivNaresh · May 3, 2021
e759145 · remove unnecessary test · ParthivNaresh · May 3, 2021
711bab7 · Merge branch 'main' into 2070-User-Specified-Date-Feature · ParthivNaresh · May 3, 2021
6fb6bde · remove unused import · ParthivNaresh · May 3, 2021
2a117c3 · Merge branch 'main' into 2070-User-Specified-Date-Feature · ParthivNaresh · May 3, 2021
2 changes: 2 additions & 0 deletions .github/workflows/build_conda_pkg.yml
@@ -1,6 +1,8 @@
name: Build Conda Package

on:
pull_request:
types: [ opened, synchronize ]
Contributor (Author): Do we want to keep this for feature builds moving forward?

Contributor: We can try it out and if it's too slow we can undo the change?

Contributor: Sure, I am a fan!

push:
branches:
- main
1 change: 1 addition & 0 deletions .github/workflows/linux_unit_tests.yml
@@ -61,6 +61,7 @@ jobs:
source test_python/bin/activate
make installdeps
make installdeps-test
pip freeze
- name: Erase Coverage
run: |
source test_python/bin/activate
1 change: 1 addition & 0 deletions .github/workflows/windows_unit_tests.yml
@@ -66,6 +66,7 @@
python -m pip install --upgrade pip
python -m pip install .
python -m pip install -r test-requirements.txt
pip freeze
- name: Run unit tests
run: |
. $env:USERPROFILE\Miniconda3\shell\condabin\conda-hook.ps1
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Added `date_index` as a required parameter for TimeSeries problems :pr:`2217`
Contributor: 👍

* Have the ``OneHotEncoder`` return the transformed columns as booleans rather than floats :pr:`2170`
* Added Oversampler transformer component to EvalML :pr:`2079`
* Updated prediction explanations functions to allow pipelines with XGBoost estimators :pr:`2162`
11 changes: 6 additions & 5 deletions evalml/automl/automl_search.py
@@ -216,7 +216,7 @@ def __init__(self,
max_iterations have precedence over stopping the search.

problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
in time series problems, values should be passed in for the gap and max_delay variables.
in time series problems, values should be passed in for the date_index, gap, and max_delay variables.

train_best_pipeline (boolean): Whether or not to train the best pipeline before returning it. Defaults to True.

@@ -491,9 +491,9 @@ def _get_funct_name(function):

def _validate_problem_configuration(self, problem_configuration=None):
if self.problem_type in [ProblemTypes.TIME_SERIES_REGRESSION]:
required_parameters = {'gap', 'max_delay'}
required_parameters = {'date_index', 'gap', 'max_delay'}
if not problem_configuration or not all(p in problem_configuration for p in required_parameters):
raise ValueError("user_parameters must be a dict containing values for at least the gap and max_delay "
raise ValueError("user_parameters must be a dict containing values for at least the date_index, gap, and max_delay "
f"parameters. Received {problem_configuration}.")
return problem_configuration or {}

@@ -714,12 +714,13 @@ def _get_baseline_pipeline(self):
pipeline_class, pipeline_name = {ProblemTypes.TIME_SERIES_REGRESSION: (TimeSeriesRegressionPipeline, "Time Series Baseline Regression Pipeline"),
ProblemTypes.TIME_SERIES_MULTICLASS: (TimeSeriesMulticlassClassificationPipeline, "Time Series Baseline Multiclass Pipeline"),
ProblemTypes.TIME_SERIES_BINARY: (TimeSeriesBinaryClassificationPipeline, "Time Series Baseline Binary Pipeline")}[self.problem_type]
date_index = self.problem_configuration['date_index']
gap = self.problem_configuration['gap']
max_delay = self.problem_configuration['max_delay']
baseline = pipeline_class(component_graph=["Time Series Baseline Estimator"],
custom_name=pipeline_name,
parameters={"pipeline": {"gap": gap, "max_delay": max_delay},
"Time Series Baseline Estimator": {"gap": gap, "max_delay": max_delay}})
parameters={"pipeline": {"date_index": date_index, "gap": gap, "max_delay": max_delay},
"Time Series Baseline Estimator": {"date_index": date_index, "gap": gap, "max_delay": max_delay}})
return baseline

def _add_baseline_pipelines(self):
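For reference, a minimal sketch of a search configured with all three now-required pipeline-level parameters. The toy data and surrounding AutoMLSearch arguments are illustrative and assume the evalml API as of this PR, not a definitive invocation:

```python
import pandas as pd
from evalml.automl import AutoMLSearch

# Toy daily series; "Date" is the user-specified datetime column.
X = pd.DataFrame({"Date": pd.date_range("2021-01-01", periods=100),
                  "feature": range(100)})
y = pd.Series(range(100), dtype="float64")

# After this change, omitting 'date_index' from problem_configuration
# raises the ValueError from _validate_problem_configuration above.
automl = AutoMLSearch(X_train=X, y_train=y,
                      problem_type="time series regression",
                      problem_configuration={"date_index": "Date",
                                             "gap": 0,
                                             "max_delay": 2})
```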
4 changes: 2 additions & 2 deletions evalml/automl/utils.py
@@ -49,7 +49,7 @@ def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=
y (ww.DataColumn, pd.Series): The target training data of length [n_samples].
problem_type (ProblemType): The type of machine learning problem.
problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
in time series problems, values should be passed in for the gap and max_delay variables. Defaults to None.
in time series problems, values should be passed in for the date_index, gap, and max_delay variables. Defaults to None.
n_splits (int, None): The number of CV splits, if applicable. Defaults to 3.
shuffle (bool): Whether or not to shuffle the data before splitting, if applicable. Defaults to True.
random_seed (int): Seed for the random number generator. Defaults to 0.
@@ -63,7 +63,7 @@ def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=
if not problem_configuration:
raise ValueError("problem_configuration is required for time series problem types")
return TimeSeriesSplit(n_splits=n_splits, gap=problem_configuration.get('gap'),
max_delay=problem_configuration.get('max_delay'))
max_delay=problem_configuration.get('max_delay'), date_index=problem_configuration.get('date_index'))
if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
if problem_type == ProblemTypes.REGRESSION:
return TrainingValidationSplit(test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle)
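A hedged sketch of the effect on the returned splitter; the import path follows the module shown in this diff and the data is illustrative:

```python
import pandas as pd
from evalml.automl.utils import make_data_splitter
from evalml.problem_types import ProblemTypes

X = pd.DataFrame({"Date": pd.date_range("2021-01-01", periods=30),
                  "feature": range(30)})
y = pd.Series(range(30))

# For time series problems the splitter is a TimeSeriesSplit, and
# date_index is now forwarded alongside gap and max_delay.
splitter = make_data_splitter(X, y, ProblemTypes.TIME_SERIES_REGRESSION,
                              problem_configuration={"date_index": "Date",
                                                     "gap": 0,
                                                     "max_delay": 2},
                              n_splits=3)
print(splitter.date_index)  # "Date"
```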
@@ -5,15 +5,14 @@
from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils import import_or_raise
from evalml.utils import import_or_raise, infer_feature_types


class ARIMARegressor(Estimator):
"""
Autoregressive Integrated Moving Average Model.
The three parameters (p, d, q) are the AR order, the degree of differencing, and the MA order.
More information here: https://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.html

"""
name = "ARIMA Regressor"
hyperparameter_ranges = {
@@ -24,7 +23,7 @@ class ARIMARegressor(Estimator):
model_family = ModelFamily.ARIMA
supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]

def __init__(self, date_column=None, trend='n', p=1, d=0, q=0,
def __init__(self, date_index=None, trend='n', p=1, d=0, q=0,
random_seed=0, **kwargs):
"""
Arguments:
@@ -41,7 +40,7 @@ def __init__(self, date_column=None, trend='n', p=1, d=0, q=0,
'trend': trend}

parameters.update(kwargs)
self.date_column = date_column
self.date_index = date_index

p_error_msg = "ARIMA is not installed. Please install using `pip install statsmodels`."

@@ -60,40 +59,23 @@
component_obj=None,
random_seed=random_seed)

def _get_dates_fit(self, X, y):
date_col = None

if isinstance(y.index, pd.DatetimeIndex):
date_col = y.index
if X is not None:
if self.date_column in X.columns:
date_col = X.pop(self.date_column)
elif isinstance(X.index, pd.DatetimeIndex):
date_col = X.index

if date_col is None:
msg = "ARIMA regressor requires input data X to have a datetime column specified by the 'date_column' parameter. " \
"If not it will look for the datetime column in the index of X or y."
raise ValueError(msg)
return date_col

def _get_dates_predict(self, X, y):
def _get_dates(self, X, y):
date_col = None

if y is not None:
if isinstance(y.index, pd.DatetimeIndex):
y_index_type = infer_feature_types(pd.Series(y.index)).logical_type.type_string
if y_index_type == 'datetime':
date_col = y.index
if X is not None:
if self.date_column in X.columns:
date_col = X.pop(self.date_column)
elif isinstance(X.index, pd.DatetimeIndex):
X_index_type = infer_feature_types(pd.Series(X.index)).logical_type.type_string
if self.date_index in X.columns:
date_col = X.pop(self.date_index)
elif X_index_type == 'datetime':
date_col = X.index

if date_col is None:
msg = "ARIMA regressor requires input data X to have a datetime column specified by the 'date_column' parameter. " \
msg = "ARIMA regressor requires input data X to have a datetime column specified by the 'date_index' parameter. " \
"If not it will look for the datetime column in the index of X or y."
raise ValueError(msg)
return date_col
return date_col, X

def _match_indices(self, X, y, date_col):
if X is not None:
Expand All @@ -110,7 +92,7 @@ def fit(self, X, y=None):
arima = import_or_raise("statsmodels.tsa.arima.model", error_msg=p_error_msg)

X, y = self._manage_woodwork(X, y)
dates = self._get_dates_fit(X, y)
dates, X = self._get_dates(X, y)
X, y = self._match_indices(X, y, dates)
new_params = {}
for key, val in self.parameters.items():
@@ -126,7 +108,7 @@

def predict(self, X, y=None):
X, y = self._manage_woodwork(X, y)
dates = self._get_dates_predict(X, y)
dates, X = self._get_dates(X, y)
X, y = self._match_indices(X, y, dates)
start = dates.min()
end = dates.max()
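The renamed parameter and the unified _get_dates helper resolve the datetime source in order of precedence: a date_index column in X, then a datetime index on X, then a datetime index on y. A minimal usage sketch, assuming statsmodels is installed and the component is exported from evalml.pipelines.components:

```python
import pandas as pd
from evalml.pipelines.components import ARIMARegressor

dates = pd.date_range("2021-01-01", periods=40)
X = pd.DataFrame({"Dates": dates, "feature": range(40)})
y = pd.Series(range(40), dtype="float64")

# 'Dates' is popped out of X by _get_dates and used to order the series;
# without such a column, a datetime index on X or y is used instead, and
# a ValueError is raised if no datetime information can be found.
arima = ARIMARegressor(date_index="Dates", trend='n', p=1, d=0, q=0)
arima.fit(X, y)
predictions = arima.predict(X)
```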
@@ -49,14 +49,15 @@ class DateTimeFeaturizer(Transformer):
"day_of_week": _extract_day_of_week,
"hour": _extract_hour}

def __init__(self, features_to_extract=None, encode_as_categories=False, random_seed=0, **kwargs):
def __init__(self, features_to_extract=None, encode_as_categories=False, date_index=None, random_seed=0, **kwargs):
"""Extracts features from DateTime columns

Arguments:
features_to_extract (list): List of features to extract. Valid options include "year", "month", "day_of_week", "hour".
encode_as_categories (bool): Whether day-of-week and month features should be encoded as pandas "category" dtype.
This allows OneHotEncoders to encode these features.
random_seed (int): Seed for the random number generator. Defaults to 0.
date_index (str): Name of the column containing the datetime information used to order the data. Ignored.
"""
if features_to_extract is None:
features_to_extract = ["year", "month", "day_of_week", "hour"]
@@ -65,9 +66,9 @@ def __init__(self, features_to_extract=None, encode_as_categories=False, random_
raise ValueError("{} are not valid options for features_to_extract".format(", ".join([f"'{feature}'" for feature in invalid_features])))

parameters = {"features_to_extract": features_to_extract,
"encode_as_categories": encode_as_categories}
"encode_as_categories": encode_as_categories,
"date_index": date_index}
parameters.update(kwargs)

self._date_time_col_names = None
self._categories = {}
self.encode_as_categories = encode_as_categories
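A small sketch of the widened constructor; the column name and data are illustrative, and per the docstring date_index is accepted but ignored by the transform itself:

```python
import pandas as pd
from evalml.pipelines.components import DateTimeFeaturizer

X = pd.DataFrame({"order_date": pd.date_range("2021-01-01", periods=6, freq="H")})

# date_index is stored in parameters so pipeline-level parameters can be
# passed uniformly to every component; extraction behavior is unchanged.
featurizer = DateTimeFeaturizer(features_to_extract=["year", "month", "hour"],
                                date_index="order_date")
X_t = featurizer.fit_transform(X)
```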
@@ -16,10 +16,11 @@ class DelayedFeatureTransformer(Transformer):
hyperparameter_ranges = {}
needs_fitting = False

def __init__(self, max_delay=2, delay_features=True, delay_target=True, gap=1, random_seed=0, **kwargs):
def __init__(self, date_index=None, max_delay=2, delay_features=True, delay_target=True, gap=1, random_seed=0, **kwargs):
"""Creates a DelayedFeatureTransformer.

Arguments:
date_index (str): Name of the column containing the datetime information used to order the data. Ignored.
max_delay (int): Maximum number of time units to delay each feature.
delay_features (bool): Whether to delay the input features.
delay_target (bool): Whether to delay the target.
@@ -29,14 +30,15 @@ def __init__(self, max_delay=2, delay_features=True, delay_target=True, gap=1, r
at 1.
random_seed (int): Seed for the random number generator. This transformer performs the same regardless of the random seed provided.
"""
self.date_index = date_index
self.max_delay = max_delay
self.delay_features = delay_features
self.delay_target = delay_target

# If 0, start at 1
self.start_delay_for_target = int(gap == 0)

parameters = {"max_delay": max_delay, "delay_target": delay_target, "delay_features": delay_features,
parameters = {"date_index": date_index, "max_delay": max_delay, "delay_target": delay_target, "delay_features": delay_features,
"gap": gap}
parameters.update(kwargs)
super().__init__(parameters=parameters, random_seed=random_seed)
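Likewise, a brief sketch of the transformer with the new pass-through parameter; the output column names reflect this era of evalml and may differ in other versions:

```python
import pandas as pd
from evalml.pipelines.components import DelayedFeatureTransformer

X = pd.DataFrame({"feature": range(8)})
y = pd.Series(range(8))

# date_index is recorded in parameters but, as documented, ignored;
# the delays themselves remain purely positional.
dft = DelayedFeatureTransformer(date_index=None, max_delay=2, gap=1)
X_t = dft.fit_transform(X, y)
# X_t now contains delayed copies such as feature_delay_1 and
# feature_delay_2, plus delayed-target columns.
```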
13 changes: 6 additions & 7 deletions evalml/pipelines/time_series_classification_pipelines.py
@@ -30,17 +30,16 @@ def __init__(self, component_graph, parameters=None, custom_name=None, custom_hy
[Imputer, One Hot Encoder, Imputer, Logistic Regression Classifier] will have names
["Imputer", "One Hot Encoder", "Imputer_2", "Logistic Regression Classifier"]
parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
An empty dictionary or None implies using all default values for component parameters. Pipeline-level
parameters such as gap and max_delay must be specified with the "pipeline" key. For example:
Pipeline(parameters={"pipeline": {"max_delay": 4, "gap": 2}}).
custom_name (str): Custom name for the pipeline. Defaults to None.
custom_hyperparameters (dict): Custom hyperparameter range for the pipeline. Defaults to None.
An empty dictionary {} implies using all default values for component parameters. Pipeline-level
parameters such as date_index, gap, and max_delay must be specified with the "pipeline" key. For example:
Pipeline(parameters={"pipeline": {"date_index": "Date", "max_delay": 4, "gap": 2}}).
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
if parameters is None or "pipeline" not in parameters:
raise ValueError("gap and max_delay parameters cannot be omitted from the parameters dict. "
if "pipeline" not in parameters:
raise ValueError("date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. "
"Please specify them as a dictionary with the key 'pipeline'.")
pipeline_params = parameters["pipeline"]
self.date_index = pipeline_params['date_index']
self.gap = pipeline_params['gap']
self.max_delay = pipeline_params['max_delay']
super().__init__(component_graph,
13 changes: 6 additions & 7 deletions evalml/pipelines/time_series_regression_pipeline.py
@@ -26,17 +26,16 @@ def __init__(self, component_graph, parameters=None, custom_name=None, custom_hy
[Imputer, One Hot Encoder, Imputer, Logistic Regression Classifier] will have names
["Imputer", "One Hot Encoder", "Imputer_2", "Logistic Regression Classifier"]
parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
An empty dictionary or None implies using all default values for component parameters. Pipeline-level
parameters such as gap and max_delay must be specified with the "pipeline" key. For example:
Pipeline(parameters={"pipeline": {"max_delay": 4, "gap": 2}}).
custom_name (str): Custom name for the pipeline. Defaults to None.
custom_hyperparameters (dict): Custom hyperparameter range for the pipeline. Defaults to None.
An empty dictionary {} implies using all default values for component parameters. Pipeline-level
parameters such as date_index, gap, and max_delay must be specified with the "pipeline" key. For example:
Pipeline(parameters={"pipeline": {"date_index": "Date", "max_delay": 4, "gap": 2}}).
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
if parameters is None or "pipeline" not in parameters:
raise ValueError("gap and max_delay parameters cannot be omitted from the parameters dict. "
if "pipeline" not in parameters:
raise ValueError("date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. "
"Please specify them as a dictionary with the key 'pipeline'.")
pipeline_params = parameters["pipeline"]
self.date_index = pipeline_params['date_index']
self.gap = pipeline_params['gap']
self.max_delay = pipeline_params['max_delay']
super().__init__(component_graph,
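Both time series pipeline classes now fail fast when the "pipeline" key is missing. A hedged construction sketch; the component graph and component names are illustrative:

```python
from evalml.pipelines import TimeSeriesRegressionPipeline

# All three pipeline-level parameters ride under the "pipeline" key;
# leaving the key out raises the ValueError shown in the diff above.
pipeline = TimeSeriesRegressionPipeline(
    component_graph=["Delayed Feature Transformer", "Random Forest Regressor"],
    parameters={"pipeline": {"date_index": "Date", "gap": 1, "max_delay": 2},
                "Delayed Feature Transformer": {"gap": 1, "max_delay": 2}},
)
```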
4 changes: 3 additions & 1 deletion evalml/preprocessing/data_splitters/time_series_split.py
@@ -6,7 +6,7 @@
class TimeSeriesSplit(BaseCrossValidator):
"""Rolling Origin Cross Validation for time series problems."""

def __init__(self, max_delay=0, gap=0, n_splits=3):
def __init__(self, max_delay=0, gap=0, date_index=None, n_splits=3):
"""Create a TimeSeriesSplit.

This class uses max_delay and gap values to take into account that evalml time series pipelines perform
@@ -21,10 +21,12 @@ def __init__(self, max_delay=0, gap=0, n_splits=3):
of rows of the current split to avoid "throwing out" more data than is necessary.
gap (int): Gap used in time series problem. Time series pipelines shift the target variable by gap rows
since we are interested in
date_index (str): Name of the column containing the datetime information used to order the data.
n_splits (int): number of data splits to make.
"""
self.max_delay = max_delay
self.gap = gap
self.date_index = date_index
self.n_splits = n_splits
self._splitter = SkTimeSeriesSplit(n_splits=n_splits)

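A short usage sketch of the splitter with the new argument; the data is illustrative, and the split mechanics are unchanged since the work is still delegated to sklearn's TimeSeriesSplit:

```python
import numpy as np
import pandas as pd
from evalml.preprocessing.data_splitters import TimeSeriesSplit

X = pd.DataFrame({"feature": np.arange(12)})
y = pd.Series(np.arange(12))

# date_index is stored on the splitter for downstream consumers;
# max_delay and gap still control how the training windows are adjusted.
ts_cv = TimeSeriesSplit(max_delay=1, gap=0, date_index=None, n_splits=3)
for train_idx, test_idx in ts_cv.split(X, y):
    print(train_idx, test_idx)
```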
2 changes: 1 addition & 1 deletion evalml/preprocessing/utils.py
@@ -55,7 +55,7 @@ def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, ran
y (ww.DataColumn, pd.Series, or np.ndarray): target data of length [n_samples]
problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
problem_configuration (dict): Additional parameters needed to configure the search. For example,
in time series problems, values should be passed in for the gap and max_delay variables.
in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
random_seed (int): Seed for the random number generator. Defaults to 0.

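Finally, a hedged example of the documented call; the toy data is illustrative and the four return values follow evalml's Woodwork-based API of this era:

```python
import pandas as pd
from evalml.preprocessing import split_data

X = pd.DataFrame({"Date": pd.date_range("2021-01-01", periods=20),
                  "feature": range(20)})
y = pd.Series(range(20))

# For time series problems rows are split in order (no shuffling), and the
# problem_configuration must now include date_index as well.
X_train, X_test, y_train, y_test = split_data(
    X, y, problem_type="time series regression",
    problem_configuration={"date_index": "Date", "gap": 0, "max_delay": 1},
    test_size=.2)
```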