Commit
Merge pull request #722 from blue-yonder/feature/some-small-refactoring
Let tsfresh choose the value column if possible and increase test coverage
nils-braun committed Jul 15, 2020
2 parents ab5bf76 + bb371e8 commit 2f8f0fa
Showing 3 changed files with 84 additions and 19 deletions.
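A minimal sketch of the behaviour this commit enables, assuming a long-format frame with illustrative column names (not taken from the diff below): if exactly one column is left over after the id, kind and sort columns, tsfresh now uses it as the value column; with more than one candidate it raises a ValueError instead.

import pandas as pd
from tsfresh.feature_extraction.data import to_tsdata

# One value column ("temperature") besides id/kind/sort.
df = pd.DataFrame({
    "id": [1, 1, 2, 2],
    "kind": ["a", "a", "a", "a"],
    "sort": [0, 1, 0, 1],
    "temperature": [20.1, 20.3, 19.8, 19.9],
})

# column_value can now be omitted: "temperature" is the only candidate.
data = to_tsdata(df, column_id="id", column_kind="kind", column_sort="sort")

# A second candidate makes the guess ambiguous, so a ValueError is raised.
df["humidity"] = [40, 41, 42, 43]
try:
    to_tsdata(df, column_id="id", column_kind="kind", column_sort="sort")
except ValueError:
    pass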
1 change: 1 addition & 0 deletions CHANGES.rst
@@ -28,6 +28,7 @@ We changed the default branch from "master" to "main".
- Fixed readthedocs (#695, #696)
- Fix spark and dask after #705 and for non-id named id columns (#712)
- Fix in the forecasting notebook (#729)
- Let tsfresh choose the value column if possible (#722)

Version 0.16.0
==============
50 changes: 49 additions & 1 deletion tests/units/feature_extraction/test_data.py
@@ -1,3 +1,5 @@
import math

import numpy as np
import pandas as pd

@@ -35,19 +37,31 @@


class DataAdapterTestCase(DataTestCase):

def test_long_tsframe(self):
df = self.create_test_data_sample()
data = LongTsFrameAdapter(df, "id", "kind", "val", "sort")

self.assert_tsdata(data, TEST_DATA_EXPECTED_TUPLES)

def test_long_tsframe_no_value_column(self):
df = self.create_test_data_sample()
data = LongTsFrameAdapter(df, "id", "kind", None, "sort")

self.assert_tsdata(data, TEST_DATA_EXPECTED_TUPLES)

def test_wide_tsframe(self):
df = self.create_test_data_sample_wide()
data = WideTsFrameAdapter(df, "id", "sort")

self.assert_tsdata(data, WIDE_TEST_DATA_EXPECTED_TUPLES)

def test_wide_tsframe_without_sort(self):
df = self.create_test_data_sample_wide()
del df["sort"]
data = WideTsFrameAdapter(df, "id")

self.assert_tsdata(data, WIDE_TEST_DATA_EXPECTED_TUPLES)

def test_dict_tsframe(self):
df = {key: df for key, df in self.create_test_data_sample().groupby(["kind"])}
data = TsDictAdapter(df, "id", "val", "sort")
@@ -58,7 +72,13 @@ def assert_tsdata(self, data, expected):
self.assertEqual(len(data), len(expected))
self.assertEqual(sum(1 for _ in data), len(data))
self.assertEqual(sum(1 for _ in data.partition(1)), len(expected))
self.assertEqual(sum(1 for _ in data.partition(2)), math.ceil(len(expected) / 2))
self.assertEqual(sum(1 for _ in data.partition(3)), math.ceil(len(expected) / 3))
self.assertEqual((sum(sum(1 for _ in g) for g in data.partition(1))), len(data))
first_partition = next(data.partition(1))
self.assertEqual(len(first_partition), 1)
first_partition = next(data.partition(4))
self.assertEqual(len(first_partition), 4)
self.assert_data_chunk_object_equal(data, expected)

def assert_data_chunk_object_equal(self, result, expected):
@@ -97,6 +117,16 @@ def test_with_dictionaries_two_rows(self):
("id_1", 'b', pd.Series([1, 2], index=[1, 0], name="value"))]
self.assert_data_chunk_object_equal(result, expected)

def test_with_dictionaries_two_rows_sorted(self):
test_df = pd.DataFrame([{"value": 1, "id": "id_1"},
{"value": 2, "id": "id_1"}])
test_dict = {"a": test_df, "b": test_df}

result = to_tsdata(test_dict, column_id="id", column_value="value")
expected = [("id_1", 'a', pd.Series([1, 2], index=[0, 1], name="value")),
("id_1", 'b', pd.Series([1, 2], index=[0, 1], name="value"))]
self.assert_data_chunk_object_equal(result, expected)

def test_wide_dataframe_order_preserved_with_sort_column(self):
""" verifies that the order of the sort column from a wide time series container is preserved
"""
@@ -122,6 +152,10 @@ def test_with_wrong_input(self):
self.assertRaises(ValueError, to_tsdata, test_df,
"strange_id", "kind", "value", "sort")

test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "value_2": 1, "sort": 1}])
self.assertRaises(ValueError, to_tsdata, test_df,
"strange_id", "kind", None, "sort")

test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}])
self.assertRaises(ValueError, to_tsdata, test_df,
"id", "strange_kind", "value", "sort")
@@ -137,6 +171,16 @@ def test_with_wrong_input(self):
test_df = pd.DataFrame([{"id": 2}, {"id": 1}])
test_dict = {"a": test_df, "b": test_df}

# column_id needs to be given
self.assertRaises(ValueError, to_tsdata, test_df,
None, "a", "b", None)
self.assertRaises(ValueError, to_tsdata, test_df,
None, "a", "b", "a")
self.assertRaises(ValueError, to_tsdata, test_dict,
None, "a", "b", None)
self.assertRaises(ValueError, to_tsdata, test_dict,
None, "a", "b", "a")

# If there is more than one column, the algorithm cannot choose the correct column
self.assertRaises(ValueError, to_tsdata, test_dict,
"id", None, None, None)
@@ -171,3 +215,7 @@ def test_with_wrong_input(self):
test_df = pd.DataFrame([{"id": 0, "sort": 0}])
self.assertRaises(ValueError, to_tsdata, test_df,
"id", None, None, "sort")

test_df = [1, 2, 3]
self.assertRaises(ValueError, to_tsdata, test_df,
"a", "b", "c", "d")
52 changes: 34 additions & 18 deletions tsfresh/feature_extraction/data.py
@@ -147,8 +147,16 @@ def _check_nan(df, *columns):
raise ValueError("Column must not contain NaN values: {}".format(col))


class WideTsFrameAdapter(SliceableTsData):
def _get_value_columns(df, *other_columns):
value_columns = [col for col in df.columns if col not in other_columns]

if len(value_columns) == 0:
raise ValueError("Could not guess the value column! Please hand it to the function as an argument.")

return value_columns


class WideTsFrameAdapter(SliceableTsData):
def __init__(self, df, column_id, column_sort=None, value_columns=None):
"""
Adapter for Pandas DataFrames in wide format, where multiple columns contain different time series for
@@ -164,17 +172,16 @@ def __init__(self, df, column_id, column_sort=None, value_columns=None):
:type column_sort: str|None
:param value_columns: list of column names to treat as time series values.
If `None`, all columns except `column_id` and `column_sort` will be used.
If `None` or empty, all columns except `column_id` and `column_sort` will be used.
:type value_columns: list[str]|None
"""
if column_id is None:
raise ValueError("A value for column_id needs to be supplied")

_check_nan(df, column_id)

if value_columns is None:
value_columns = [col for col in df.columns if col not in [column_id, column_sort]]

if len(value_columns) == 0:
raise ValueError("You must provide at least one value column")
value_columns = _get_value_columns(df, column_id, column_sort)

_check_nan(df, *value_columns)
_check_colname(*value_columns)
@@ -208,7 +215,6 @@ def slice(self, offset, length=None):


class LongTsFrameAdapter(TsData):

def __init__(self, df, column_id, column_kind, column_value, column_sort=None):
"""
Adapter for Pandas DataFrames in long format, where different time series for the same id are
@@ -229,12 +235,22 @@ def __init__(self, df, column_id, column_kind, column_value, column_sort=None):
:param column_sort: the name of the column to sort on
:type column_sort: str|None
"""
if column_id is None:
raise ValueError("A value for column_id needs to be supplied")
if column_kind is None:
raise ValueError("A value for column_kind needs to be supplied")

if column_value is None:
possible_value_columns = _get_value_columns(df, column_id, column_sort, column_kind)
if len(possible_value_columns) != 1:
raise ValueError("Could not guess the value column! Please hand it to the function as an argument.")
self.column_value = possible_value_columns[0]
else:
self.column_value = column_value

_check_nan(df, column_id, column_kind, column_value)
_check_nan(df, column_id, column_kind, self.column_value)
_check_colname(column_kind)

self.column_value = column_value

if column_sort is not None:
_check_nan(df, column_sort)
self.df_grouped = df.sort_values([column_sort]).groupby([column_id, column_kind])
@@ -291,7 +307,7 @@ def __len__(self):
return sum(grouped_df.ngroups for grouped_df in self.grouped_dict.values())


def to_tsdata(df, column_id=None, column_kind=None, column_value=None, column_sort=None):
def to_tsdata(df, column_id, column_kind=None, column_value=None, column_sort=None):
"""
Wrap supported data formats as a TsData object, i.e. an iterable of individual time series.
@@ -316,7 +332,7 @@ def to_tsdata(df, column_id=None, column_kind=None, column_value=None, column_so
:type df: pd.DataFrame|dict|TsData
:param column_id: The name of the id column to group by.
:type column_id: str|None
:type column_id: str
:param column_kind: The name of the column keeping record on the kind of the value.
:type column_kind: str|None
@@ -335,13 +351,13 @@ def to_tsdata(df, column_id=None, column_kind=None, column_value=None, column_so
return df

elif isinstance(df, pd.DataFrame):
if column_value is not None:
if column_kind is not None:
return LongTsFrameAdapter(df, column_id, column_kind, column_value, column_sort)
else:
return WideTsFrameAdapter(df, column_id, column_sort, [column_value])
if column_kind is not None:
return LongTsFrameAdapter(df, column_id, column_kind, column_value, column_sort)
else:
return WideTsFrameAdapter(df, column_id, column_sort)
if column_value is not None:
return WideTsFrameAdapter(df, column_id, column_sort, [column_value])
else:
return WideTsFrameAdapter(df, column_id, column_sort)

elif isinstance(df, dict):
return TsDictAdapter(df, column_id, column_value, column_sort)
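To summarise the dispatch logic after this change, a hedged usage sketch (frames and column names are illustrative, not taken from the repository): passing column_kind selects the long adapter, which now guesses the value column when column_value is omitted; without column_kind the wide adapter is used, either over all remaining columns or restricted to an explicit column_value.

import pandas as pd
from tsfresh.feature_extraction.data import to_tsdata

long_df = pd.DataFrame({
    "id": [1, 1, 2, 2],
    "kind": ["a", "b", "a", "b"],
    "value": [1.0, 2.0, 3.0, 4.0],
})
wide_df = pd.DataFrame({
    "id": [1, 1, 2, 2],
    "time": [0, 1, 0, 1],
    "x": [1.0, 2.0, 3.0, 4.0],
    "y": [4.0, 3.0, 2.0, 1.0],
})

# column_kind given -> LongTsFrameAdapter; "value" is the only leftover
# column, so it is picked automatically.
to_tsdata(long_df, column_id="id", column_kind="kind")

# No column_kind -> WideTsFrameAdapter over all non-id/sort columns
# ("x" and "y" each become one kind).
to_tsdata(wide_df, column_id="id", column_sort="time")

# No column_kind, explicit column_value -> wide adapter restricted to "x".
to_tsdata(wide_df, column_id="id", column_value="x", column_sort="time")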
