From 23b0efaf95010eca882192b5d5679d5a9cc0490c Mon Sep 17 00:00:00 2001
From: Maximilian Christ <max.christ@me.com>
Date: Sun, 9 Sep 2018 11:48:46 +0200
Subject: [PATCH] Improve warnings (#433)

* improve warnings and sanity checks for select features
---
 .../test_relevant_feature_extraction.py       | 38 +++++++++++++++----
 .../units/feature_selection/test_selection.py | 15 +++++---
 .../utilities/test_dataframe_functions.py     | 16 +++++++-
 tsfresh/convenience/relevant_extraction.py    | 16 +++++++-
 .../feature_extraction/feature_calculators.py |  4 +-
 tsfresh/feature_selection/selection.py        | 21 +++++-----
 tsfresh/utilities/dataframe_functions.py      | 22 +++++++++++
 7 files changed, 102 insertions(+), 30 deletions(-)

diff --git a/tests/integrations/test_relevant_feature_extraction.py b/tests/integrations/test_relevant_feature_extraction.py
index e7fd66e45..4a437b23f 100644
--- a/tests/integrations/test_relevant_feature_extraction.py
+++ b/tests/integrations/test_relevant_feature_extraction.py
@@ -50,17 +50,16 @@ class RelevantFeatureExtractionTestCase(TestCase):
 
     def setUp(self):
         np.random.seed(42)
-
-        y = pd.Series(np.random.binomial(1, 0.5, 10), index=range(10))
+        y = pd.Series(np.random.binomial(1, 0.5, 20), index=range(20))
         df = pd.DataFrame(index=range(100))
 
         df["a"] = np.random.normal(0, 1, 100)
         df["b"] = np.random.normal(0, 1, 100)
-        df["id"] = np.repeat(range(10), 10)
+        df["id"] = np.repeat(range(20), 5)
 
-        X = pd.DataFrame(index=range(10))
-        X["f1"] = np.random.normal(0, 1, 10)
-        X["f2"] = np.random.normal(0, 1, 10)
+        X = pd.DataFrame(index=range(20))
+        X["f1"] = np.random.normal(0, 1, 20)
+        X["f2"] = np.random.normal(0, 1, 20)
 
         self.df = df
         self.X = X
@@ -87,4 +86,29 @@ def test_extraction_null_as_column_name(self):
 
         df3 = pd.DataFrame(data={0: range(10), 2: np.repeat([0, 1], 5), 1: np.repeat([0, 1, 2, 3, 4], 2)})
         X3 = extract_features(df3, column_id=2, column_sort=1)
-        self.assertEqual(len(X3), 2)
\ No newline at end of file
+        self.assertEqual(len(X3), 2)
+
+    def test_raises_mismatch_index_df_and_y_df_more(self):
+        y = pd.Series(range(3), index=[1, 2, 3])
+        df_dict = {"a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
+                   "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})}
+        self.assertRaises(ValueError, extract_relevant_features, df_dict, y, None, None, None, "id", None, "val")
+
+    def test_raises_mismatch_index_df_and_y_y_more(self):
+        y = pd.Series(range(4), index=[1, 2, 3, 4])
+        df = pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]})
+        self.assertRaises(ValueError, extract_relevant_features, df, y, None, None, None, "id", None, "val")
+
+    def test_raises_y_not_series(self):
+        y = np.arange(10)
+        df_dict = {"a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
+                   "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})}
+        self.assertRaises(AssertionError, extract_relevant_features, df_dict, y, None, None, None, "id", None, "val")
+
+    def test_raises_y_not_more_than_one_label(self):
+        y = pd.Series(1, index=[1, 2, 3])
+        df_dict = {"a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
+                   "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})}
+        self.assertRaises(AssertionError, extract_relevant_features, df_dict, y, None, None, None, "id", None, "val")
+
+
diff --git a/tests/units/feature_selection/test_selection.py b/tests/units/feature_selection/test_selection.py
index 2d7c30866..8f9716a02 100644
--- a/tests/units/feature_selection/test_selection.py
+++ b/tests/units/feature_selection/test_selection.py
@@ -12,20 +12,19 @@
 
 class TestSelectFeatures:
     def test_assert_list(self):
-        with pytest.raises(TypeError):
+        with pytest.raises(AssertionError):
             select_features(pd.DataFrame(index=range(2)),[1,2,3])
 
-
     def test_assert_one_row_X(self):
         X = pd.DataFrame([1], index=[1])
         y = pd.Series([1], index=[1])
-        with pytest.raises(ValueError):
+        with pytest.raises(AssertionError):
             select_features(X, y)
 
     def test_assert_one_label_y(self):
         X = pd.DataFrame([10, 10], index=[1, 2])
         y = pd.Series([1, 1], index=[1, 2])
-        with pytest.raises(ValueError):
+        with pytest.raises(AssertionError):
             select_features(X, y)
 
     def test_assert_different_index(self):
@@ -34,13 +33,17 @@ def test_assert_different_index(self):
         with pytest.raises(ValueError):
             select_features(X, y)
 
-
     def test_assert_shorter_y(self):
         X = pd.DataFrame([1, 2], index=[1, 2])
         y = np.array([1])
-        with pytest.raises(ValueError):
+        with pytest.raises(AssertionError):
             select_features(X, y)
 
+    def test_assert_X_is_DataFrame(self):
+        X = np.array([[1, 2], [1, 2]])
+        y = np.array([1])
+        with pytest.raises(AssertionError):
+            select_features(X, y)
 
     def test_selects_for_each_class(self):
         df = pd.DataFrame()
diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py
index 852d0cdda..593910a41 100644
--- a/tests/units/utilities/test_dataframe_functions.py
+++ b/tests/units/utilities/test_dataframe_functions.py
@@ -10,6 +10,8 @@
 import six
 from pandas.testing import assert_frame_equal, assert_series_equal
 
+from tsfresh.utilities.dataframe_functions import get_ids
+
 
 class NormalizeTestCase(TestCase):
     def test_with_dictionaries_one_row(self):
@@ -647,4 +649,16 @@ def test_make_forecasting_frame_pdSeries(self):
                                                               "2011-01-01 02:00:00"])
                                     })
         assert_frame_equal(df.sort_index(axis=1), expected_df.sort_index(axis=1))
-        assert_series_equal(y, expected_y)
\ No newline at end of file
+        assert_series_equal(y, expected_y)
+
+
+class GetIDsTestCase(TestCase):
+
+    def test_get_id__correct_DataFrame(self):
+        df = pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]})
+        self.assertEqual(get_ids(df, "id"), set([1, 2]))
+
+    def test_get_id__correct_dict(self):
+        df_dict = {"a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
+                   "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})}
+        self.assertEqual(get_ids(df_dict, "id"), set([1, 2, 3, 4]))
diff --git a/tsfresh/convenience/relevant_extraction.py b/tsfresh/convenience/relevant_extraction.py
index cca2951ba..99910e9bc 100644
--- a/tsfresh/convenience/relevant_extraction.py
+++ b/tsfresh/convenience/relevant_extraction.py
@@ -7,7 +7,7 @@
 from tsfresh.feature_extraction import extract_features
 from tsfresh import defaults
 from tsfresh.feature_selection import select_features
-from tsfresh.utilities.dataframe_functions import restrict_input_to_index, impute
+from tsfresh.utilities.dataframe_functions import restrict_input_to_index, impute, get_ids
 
 
 def extract_relevant_features(timeseries_container, y, X=None,
@@ -133,9 +133,23 @@ def extract_relevant_features(timeseries_container, y, X=None,
 
     :return: Feature matrix X, possibly extended with relevant time series features.
     """
+
+    assert isinstance(y, pd.Series)
+    assert len(set(y)) > 1, "Feature selection is only possible if more than 1 label/class is provided"
+
     if X is not None:
         timeseries_container = restrict_input_to_index(timeseries_container, column_id, X.index)
 
+    ids_container = get_ids(df_or_dict=timeseries_container, column_id=column_id)
+    ids_y = set(y.index)
+    if ids_container != ids_y:
+        if len(ids_container - ids_y) > 0:
+            raise ValueError("The following ids are in the time series container but are missing in y: "
+                             "{}".format(ids_container - ids_y))
+        if len(ids_y - ids_container) > 0:
+            raise ValueError("The following ids are in y but are missing inside the time series container: "
+                             "{}".format(ids_y - ids_container))
+
     X_ext = extract_features(timeseries_container,
                              default_fc_parameters=default_fc_parameters,
                              kind_to_fc_parameters=kind_to_fc_parameters,
diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py
index 41458ffee..48f5c4320 100644
--- a/tsfresh/feature_extraction/feature_calculators.py
+++ b/tsfresh/feature_extraction/feature_calculators.py
@@ -1789,14 +1789,12 @@ def energy_ratio_by_chunks(x, param):
     which is the segment number (starting at zero) to return a feature on.
 
     If the length of the time series is not a multiple of the number of segments, the remaining data points are
-    distributed on the bins starting from the first. For example, if your time series consists of 8 datapoints, the
+    distributed on the bins starting from the first. For example, if your time series consists of 8 entries, the
     first two bins will contain 3 and the last two values, e.g. `[ 0.,  1.,  2.], [ 3.,  4.,  5.]` and `[ 6.,  7.]`.
 
     Note that the answer for `num_segments = 1` is a trivial "1" but we handle this scenario
     in case somebody calls it. Sum of the ratios should be 1.0.
 
-    Returns an AssertionError for N <= 0
-
     :param x: the time series to calculate the feature of
     :type x: pandas.Series
     :param param: contains dictionaries {"num_segments": N, "segment_focus": i} with N, i both ints
diff --git a/tsfresh/feature_selection/selection.py b/tsfresh/feature_selection/selection.py
index a3d526834..5a8c3729a 100644
--- a/tsfresh/feature_selection/selection.py
+++ b/tsfresh/feature_selection/selection.py
@@ -128,21 +128,18 @@ def select_features(X, y, test_for_binary_target_binary_feature=defaults.TEST_FO
     :raises: ``ValueError`` when the target vector does not fit to the feature matrix
              or `ml_task` is not one of `'auto'`, `'classification'` or `'regression'`.
     """
+    assert isinstance(X, pd.DataFrame), "Please pass features in X as pandas.DataFrame."
     check_for_nans_in_columns(X)
+    assert isinstance(y, (pd.Series, np.ndarray)), "The type of target vector y must be one of: " \
+                                                   "pandas.Series, numpy.ndarray"
+    assert len(y) > 1, "y must contain at least two samples."
+    assert len(X) == len(y), "X and y must contain the same number of samples."
+    assert len(set(y)) > 1, "Feature selection is only possible if more than 1 label/class is provided"
 
-    if not isinstance(y, (pd.Series, np.ndarray)):
-        raise TypeError("The type of target vector y must be one of: pandas.Series, numpy.ndarray")
-
-    if len(X) < 2:
-        raise ValueError("X must contain at least two samples.")
-    elif len(set(y)) == 1:
-        raise ValueError("y contains only one kind of label, no feature selection possible.")
-    elif isinstance(y, pd.Series) and not X.index.isin(y.index).all():
-        raise ValueError("Index of X must be a subset of y's index")
-    elif isinstance(y, np.ndarray):
-        if not len(y) >= len(X):
-            raise ValueError("Target vector y is shorter than feature matrix X")
+    if isinstance(y, pd.Series) and set(X.index) != set(y.index):
+        raise ValueError("Index of X and y must be identical if provided")
 
+    if isinstance(y, np.ndarray):
         y = pd.Series(y, index=X.index)
 
     relevance_table = calculate_relevance_table(
diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py
index 41bda426a..7198a7c9c 100644
--- a/tsfresh/utilities/dataframe_functions.py
+++ b/tsfresh/utilities/dataframe_functions.py
@@ -209,6 +209,28 @@ def restrict_input_to_index(df_or_dict, column_id, index):
     return df_or_dict_restricted
 
 
+def get_ids(df_or_dict, column_id):
+    """
+    Aggregates all ids in column_id from the time series container.
+
+    :param df_or_dict: a pandas DataFrame or a dictionary.
+    :type df_or_dict: pandas.DataFrame or dict
+    :param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary.
+        It is not allowed to have NaN values in this column.
+    :type column_id: basestring
+
+    :return: a set with all existing ids in the time series container
+    :rtype: Set
+    :raise: ``TypeError`` if df_or_dict is not of type dict or pandas.DataFrame
+    """
+    if isinstance(df_or_dict, pd.DataFrame):
+        return set(df_or_dict[column_id])
+    elif isinstance(df_or_dict, dict):
+        return set.union(*[set(df[column_id]) for _, df in df_or_dict.items()])
+    else:
+        raise TypeError("df_or_dict should be of type dict or pandas.DataFrame")
+
+
 # todo: add more testcases
 # todo: rewrite in a more straightforward way
 def _normalize_input_to_internal_representation(timeseries_container, column_id, column_sort, column_kind, column_value):