Commit
Merge pull request #722 from blue-yonder/feature/some-small-refactoring
Let tsfresh choose the value column if possible and increase test coverage
nils-braun committed Jul 15, 2020
2 parents ab5bf76 + bb371e8 commit 2f8f0fa
Showing 3 changed files with 84 additions and 19 deletions.
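A minimal sketch of the behaviour this commit enables, assuming a long-format frame with illustrative column names (not taken from the diff below): if exactly one column is left over after the id, kind and sort columns, tsfresh now uses it as the value column; with more than one candidate it raises a ValueError instead.

import pandas as pd
from tsfresh.feature_extraction.data import to_tsdata

# One value column ("temperature") besides id/kind/sort.
df = pd.DataFrame({
    "id": [1, 1, 2, 2],
    "kind": ["a", "a", "a", "a"],
    "sort": [0, 1, 0, 1],
    "temperature": [20.1, 20.3, 19.8, 19.9],
})

# column_value can now be omitted: "temperature" is the only candidate.
data = to_tsdata(df, column_id="id", column_kind="kind", column_sort="sort")

# A second candidate makes the guess ambiguous, so a ValueError is raised.
df["humidity"] = [40, 41, 42, 43]
try:
    to_tsdata(df, column_id="id", column_kind="kind", column_sort="sort")
except ValueError:
    pass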
1 change: 1 addition & 0 deletions CHANGES.rst
@@ -28,6 +28,7 @@ We changed the default branch from "master" to "main".
- Fixed readthedocs (#695, #696)
- Fix spark and dask after #705 and for non-id named id columns (#712)
- Fix in the forecasting notebook (#729)
- Let tsfresh choose the value column if possible (#722)

Version 0.16.0
==============
50 changes: 49 additions & 1 deletion tests/units/feature_extraction/test_data.py
@@ -1,3 +1,5 @@
import math

import numpy as np
import pandas as pd

@@ -35,19 +37,31 @@


class DataAdapterTestCase(DataTestCase):

def test_long_tsframe(self):
df = self.create_test_data_sample()
data = LongTsFrameAdapter(df, "id", "kind", "val", "sort")

self.assert_tsdata(data, TEST_DATA_EXPECTED_TUPLES)

def test_long_tsframe_no_value_column(self):
df = self.create_test_data_sample()
data = LongTsFrameAdapter(df, "id", "kind", None, "sort")

self.assert_tsdata(data, TEST_DATA_EXPECTED_TUPLES)

def test_wide_tsframe(self):
df = self.create_test_data_sample_wide()
data = WideTsFrameAdapter(df, "id", "sort")

self.assert_tsdata(data, WIDE_TEST_DATA_EXPECTED_TUPLES)

def test_wide_tsframe_without_sort(self):
df = self.create_test_data_sample_wide()
del df["sort"]
data = WideTsFrameAdapter(df, "id")

self.assert_tsdata(data, WIDE_TEST_DATA_EXPECTED_TUPLES)

def test_dict_tsframe(self):
df = {key: df for key, df in self.create_test_data_sample().groupby(["kind"])}
data = TsDictAdapter(df, "id", "val", "sort")
@@ -58,7 +72,13 @@ def assert_tsdata(self, data, expected):
self.assertEqual(len(data), len(expected))
self.assertEqual(sum(1 for _ in data), len(data))
self.assertEqual(sum(1 for _ in data.partition(1)), len(expected))
self.assertEqual(sum(1 for _ in data.partition(2)), math.ceil(len(expected) / 2))
self.assertEqual(sum(1 for _ in data.partition(3)), math.ceil(len(expected) / 3))
self.assertEqual((sum(sum(1 for _ in g) for g in data.partition(1))), len(data))
first_partition = next(data.partition(1))
self.assertEqual(len(first_partition), 1)
first_partition = next(data.partition(4))
self.assertEqual(len(first_partition), 4)
self.assert_data_chunk_object_equal(data, expected)

def assert_data_chunk_object_equal(self, result, expected):
@@ -97,6 +117,16 @@ def test_with_dictionaries_two_rows(self):
("id_1", 'b', pd.Series([1, 2], index=[1, 0], name="value"))]
self.assert_data_chunk_object_equal(result, expected)

def test_with_dictionaries_two_rows_sorted(self):
test_df = pd.DataFrame([{"value": 1, "id": "id_1"},
{"value": 2, "id": "id_1"}])
test_dict = {"a": test_df, "b": test_df}

result = to_tsdata(test_dict, column_id="id", column_value="value")
expected = [("id_1", 'a', pd.Series([1, 2], index=[0, 1], name="value")),
("id_1", 'b', pd.Series([1, 2], index=[0, 1], name="value"))]
self.assert_data_chunk_object_equal(result, expected)

def test_wide_dataframe_order_preserved_with_sort_column(self):
""" verifies that the order of the sort column from a wide time series container is preserved
"""
@@ -122,6 +152,10 @@ def test_with_wrong_input(self):
self.assertRaises(ValueError, to_tsdata, test_df,
"strange_id", "kind", "value", "sort")

test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "value_2": 1, "sort": 1}])
self.assertRaises(ValueError, to_tsdata, test_df,
"strange_id", "kind", None, "sort")

test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}])
self.assertRaises(ValueError, to_tsdata, test_df,
"id", "strange_kind", "value", "sort")
@@ -137,6 +171,16 @@ def test_with_wrong_input(self):
test_df = pd.DataFrame([{"id": 2}, {"id": 1}])
test_dict = {"a": test_df, "b": test_df}

# column_id needs to be given
self.assertRaises(ValueError, to_tsdata, test_df,
None, "a", "b", None)
self.assertRaises(ValueError, to_tsdata, test_df,
None, "a", "b", "a")
self.assertRaises(ValueError, to_tsdata, test_dict,
None, "a", "b", None)
self.assertRaises(ValueError, to_tsdata, test_dict,
None, "a", "b", "a")

# If there is more than one column, the algorithm cannot choose the correct column
self.assertRaises(ValueError, to_tsdata, test_dict,
"id", None, None, None)
@@ -171,3 +215,7 @@ def test_with_wrong_input(self):
test_df = pd.DataFrame([{"id": 0, "sort": 0}])
self.assertRaises(ValueError, to_tsdata, test_df,
"id", None, None, "sort")

test_df = [1, 2, 3]
self.assertRaises(ValueError, to_tsdata, test_df,
"a", "b", "c", "d")
52 changes: 34 additions & 18 deletions tsfresh/feature_extraction/data.py
@@ -147,8 +147,16 @@ def _check_nan(df, *columns):
raise ValueError("Column must not contain NaN values: {}".format(col))


class WideTsFrameAdapter(SliceableTsData):
def _get_value_columns(df, *other_columns):
value_columns = [col for col in df.columns if col not in other_columns]

if len(value_columns) == 0:
raise ValueError("Could not guess the value column! Please hand it to the function as an argument.")

return value_columns


class WideTsFrameAdapter(SliceableTsData):
def __init__(self, df, column_id, column_sort=None, value_columns=None):
"""
Adapter for Pandas DataFrames in wide format, where multiple columns contain different time series for
@@ -164,17 +172,16 @@ def __init__(self, df, column_id, column_sort=None, value_columns=None):
:type column_sort: str|None
:param value_columns: list of column names to treat as time series values.
If `None`, all columns except `column_id` and `column_sort` will be used.
If `None` or empty, all columns except `column_id` and `column_sort` will be used.
:type value_columns: list[str]|None
"""
if column_id is None:
raise ValueError("A value for column_id needs to be supplied")

_check_nan(df, column_id)

if value_columns is None:
value_columns = [col for col in df.columns if col not in [column_id, column_sort]]

if len(value_columns) == 0:
raise ValueError("You must provide at least one value column")
value_columns = _get_value_columns(df, column_id, column_sort)

_check_nan(df, *value_columns)
_check_colname(*value_columns)
@@ -208,7 +215,6 @@ def slice(self, offset, length=None):


class LongTsFrameAdapter(TsData):

def __init__(self, df, column_id, column_kind, column_value, column_sort=None):
"""
Adapter for Pandas DataFrames in long format, where different time series for the same id are
@@ -229,12 +235,22 @@ def __init__(self, df, column_id, column_kind, column_value, column_sort=None):
:param column_sort: the name of the column to sort on
:type column_sort: str|None
"""
if column_id is None:
raise ValueError("A value for column_id needs to be supplied")
if column_kind is None:
raise ValueError("A value for column_kind needs to be supplied")

if column_value is None:
possible_value_columns = _get_value_columns(df, column_id, column_sort, column_kind)
if len(possible_value_columns) != 1:
raise ValueError("Could not guess the value column! Please hand it to the function as an argument.")
self.column_value = possible_value_columns[0]
else:
self.column_value = column_value

_check_nan(df, column_id, column_kind, column_value)
_check_nan(df, column_id, column_kind, self.column_value)
_check_colname(column_kind)

self.column_value = column_value

if column_sort is not None:
_check_nan(df, column_sort)
self.df_grouped = df.sort_values([column_sort]).groupby([column_id, column_kind])
@@ -291,7 +307,7 @@ def __len__(self):
return sum(grouped_df.ngroups for grouped_df in self.grouped_dict.values())


def to_tsdata(df, column_id=None, column_kind=None, column_value=None, column_sort=None):
def to_tsdata(df, column_id, column_kind=None, column_value=None, column_sort=None):
"""
Wrap supported data formats as a TsData object, i.e. an iterable of individual time series.
@@ -316,7 +332,7 @@ def to_tsdata(df, column_id=None, column_kind=None, column_value=None, column_so
:type df: pd.DataFrame|dict|TsData
:param column_id: The name of the id column to group by.
:type column_id: str|None
:type column_id: str
:param column_kind: The name of the column keeping record on the kind of the value.
:type column_kind: str|None
@@ -335,13 +351,13 @@ def to_tsdata(df, column_id=None, column_kind=None, column_value=None, column_so
return df

elif isinstance(df, pd.DataFrame):
if column_value is not None:
if column_kind is not None:
return LongTsFrameAdapter(df, column_id, column_kind, column_value, column_sort)
else:
return WideTsFrameAdapter(df, column_id, column_sort, [column_value])
if column_kind is not None:
return LongTsFrameAdapter(df, column_id, column_kind, column_value, column_sort)
else:
return WideTsFrameAdapter(df, column_id, column_sort)
if column_value is not None:
return WideTsFrameAdapter(df, column_id, column_sort, [column_value])
else:
return WideTsFrameAdapter(df, column_id, column_sort)

elif isinstance(df, dict):
return TsDictAdapter(df, column_id, column_value, column_sort)
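To summarise the dispatch logic after this change, a hedged usage sketch (frames and column names are illustrative, not taken from the repository): passing column_kind selects the long adapter, which now guesses the value column when column_value is omitted; without column_kind the wide adapter is used, either over all remaining columns or restricted to an explicit column_value.

import pandas as pd
from tsfresh.feature_extraction.data import to_tsdata

long_df = pd.DataFrame({
    "id": [1, 1, 2, 2],
    "kind": ["a", "b", "a", "b"],
    "value": [1.0, 2.0, 3.0, 4.0],
})
wide_df = pd.DataFrame({
    "id": [1, 1, 2, 2],
    "time": [0, 1, 0, 1],
    "x": [1.0, 2.0, 3.0, 4.0],
    "y": [4.0, 3.0, 2.0, 1.0],
})

# column_kind given -> LongTsFrameAdapter; "value" is the only leftover
# column, so it is picked automatically.
to_tsdata(long_df, column_id="id", column_kind="kind")

# No column_kind -> WideTsFrameAdapter over all non-id/sort columns
# ("x" and "y" each become one kind).
to_tsdata(wide_df, column_id="id", column_sort="time")

# No column_kind, explicit column_value -> wide adapter restricted to "x".
to_tsdata(wide_df, column_id="id", column_value="x", column_sort="time")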
