Skip to content

Commit

Permalink
Improve warnings (#433)
Browse files Browse the repository at this point in the history
* improve warnings and sanity checks for select features
  • Loading branch information
MaxBenChrist committed Sep 9, 2018
1 parent b00229f commit 23b0efa
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 30 deletions.
38 changes: 31 additions & 7 deletions tests/integrations/test_relevant_feature_extraction.py
Expand Up @@ -50,17 +50,16 @@ class RelevantFeatureExtractionTestCase(TestCase):

def setUp(self):
np.random.seed(42)

y = pd.Series(np.random.binomial(1, 0.5, 10), index=range(10))
y = pd.Series(np.random.binomial(1, 0.5, 20), index=range(20))
df = pd.DataFrame(index=range(100))

df["a"] = np.random.normal(0, 1, 100)
df["b"] = np.random.normal(0, 1, 100)
df["id"] = np.repeat(range(10), 10)
df["id"] = np.repeat(range(20), 5)

X = pd.DataFrame(index=range(10))
X["f1"] = np.random.normal(0, 1, 10)
X["f2"] = np.random.normal(0, 1, 10)
X = pd.DataFrame(index=range(20))
X["f1"] = np.random.normal(0, 1, 20)
X["f2"] = np.random.normal(0, 1, 20)

self.df = df
self.X = X
Expand All @@ -87,4 +86,29 @@ def test_extraction_null_as_column_name(self):

df3 = pd.DataFrame(data={0: range(10), 2: np.repeat([0, 1], 5), 1: np.repeat([0, 1, 2, 3, 4], 2)})
X3 = extract_features(df3, column_id=2, column_sort=1)
self.assertEqual(len(X3), 2)
self.assertEqual(len(X3), 2)

def test_raises_mismatch_index_df_and_y_df_more(self):
    """The container holds id 4, which has no label in y, so a ValueError is expected."""
    target = pd.Series(range(3), index=[1, 2, 3])
    frame_a = pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]})
    frame_b = pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})
    container = {"a": frame_a, "b": frame_b}
    with self.assertRaises(ValueError):
        extract_relevant_features(container, target, None, None, None, "id", None, "val")

def test_raises_mismatch_index_df_and_y_y_more(self):
    """y carries labels for ids 3 and 4, which never occur in the container, so a ValueError is expected."""
    target = pd.Series(range(4), index=[1, 2, 3, 4])
    frame = pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]})
    with self.assertRaises(ValueError):
        extract_relevant_features(frame, target, None, None, None, "id", None, "val")

def test_raises_y_not_series(self):
    """A numpy array passed as y (instead of a pandas.Series) must trigger an AssertionError."""
    target = np.arange(10)
    frame_a = pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]})
    frame_b = pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})
    container = {"a": frame_a, "b": frame_b}
    with self.assertRaises(AssertionError):
        extract_relevant_features(container, target, None, None, None, "id", None, "val")

def test_raises_y_not_more_than_one_label(self):
    """A target holding one single label value must trigger an AssertionError."""
    # Every entry of the target is the constant 1, i.e. there is only one class.
    target = pd.Series(1, index=[1, 2, 3])
    frame_a = pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]})
    frame_b = pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})
    container = {"a": frame_a, "b": frame_b}
    with self.assertRaises(AssertionError):
        extract_relevant_features(container, target, None, None, None, "id", None, "val")


15 changes: 9 additions & 6 deletions tests/units/feature_selection/test_selection.py
Expand Up @@ -12,20 +12,19 @@

class TestSelectFeatures:
def test_assert_list(self):
with pytest.raises(TypeError):
with pytest.raises(AssertionError):
select_features(pd.DataFrame(index=range(2)),[1,2,3])


def test_assert_one_row_X(self):
X = pd.DataFrame([1], index=[1])
y = pd.Series([1], index=[1])
with pytest.raises(ValueError):
with pytest.raises(AssertionError):
select_features(X, y)

def test_assert_one_label_y(self):
X = pd.DataFrame([10, 10], index=[1, 2])
y = pd.Series([1, 1], index=[1, 2])
with pytest.raises(ValueError):
with pytest.raises(AssertionError):
select_features(X, y)

def test_assert_different_index(self):
Expand All @@ -34,13 +33,17 @@ def test_assert_different_index(self):
with pytest.raises(ValueError):
select_features(X, y)


def test_assert_shorter_y(self):
X = pd.DataFrame([1, 2], index=[1, 2])
y = np.array([1])
with pytest.raises(ValueError):
with pytest.raises(AssertionError):
select_features(X, y)

def test_assert_X_is_DataFrame(self):
    """A plain numpy array passed as feature matrix must be rejected with an AssertionError."""
    features = np.array([[1, 2], [1, 2]])
    target = np.array([1])
    with pytest.raises(AssertionError):
        select_features(features, target)

def test_selects_for_each_class(self):
df = pd.DataFrame()
Expand Down
16 changes: 15 additions & 1 deletion tests/units/utilities/test_dataframe_functions.py
Expand Up @@ -10,6 +10,8 @@
import six
from pandas.testing import assert_frame_equal, assert_series_equal

from tsfresh.utilities.dataframe_functions import get_ids


class NormalizeTestCase(TestCase):
def test_with_dictionaries_one_row(self):
Expand Down Expand Up @@ -647,4 +649,16 @@ def test_make_forecasting_frame_pdSeries(self):
"2011-01-01 02:00:00"])
})
assert_frame_equal(df.sort_index(axis=1), expected_df.sort_index(axis=1))
assert_series_equal(y, expected_y)
assert_series_equal(y, expected_y)


class GetIDsTestCase(TestCase):
    """Checks that get_ids collects the distinct ids of a time series container."""

    def test_get_id__correct_DataFrame(self):
        # Single DataFrame: the result is the set of distinct values in the id column.
        frame = pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]})
        found_ids = get_ids(frame, "id")
        self.assertEqual(found_ids, {1, 2})

    def test_get_id__correct_dict(self):
        # Dictionary of DataFrames: the ids of all entries are merged into one set.
        frame_a = pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]})
        frame_b = pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})
        found_ids = get_ids({"a": frame_a, "b": frame_b}, "id")
        self.assertEqual(found_ids, {1, 2, 3, 4})
16 changes: 15 additions & 1 deletion tsfresh/convenience/relevant_extraction.py
Expand Up @@ -7,7 +7,7 @@
from tsfresh.feature_extraction import extract_features
from tsfresh import defaults
from tsfresh.feature_selection import select_features
from tsfresh.utilities.dataframe_functions import restrict_input_to_index, impute
from tsfresh.utilities.dataframe_functions import restrict_input_to_index, impute, get_ids


def extract_relevant_features(timeseries_container, y, X=None,
Expand Down Expand Up @@ -133,9 +133,23 @@ def extract_relevant_features(timeseries_container, y, X=None,
:return: Feature matrix X, possibly extended with relevant time series features.
"""

assert isinstance(y, pd.Series)
assert len(set(y)) > 1, "Feature selection is only possible if more than 1 label/class is provided"

if X is not None:
timeseries_container = restrict_input_to_index(timeseries_container, column_id, X.index)

ids_container = get_ids(df_or_dict=timeseries_container, column_id=column_id)
ids_y = set(y.index)
if ids_container != ids_y:
if len(ids_container - ids_y) > 0:
raise ValueError("The following ids are in the time series container but are missing in y: "
"{}".format(ids_container - ids_y))
if len(ids_y - ids_container) > 0:
raise ValueError("The following ids are in y but are missing inside the time series container: "
"{}".format(ids_y - ids_container))

X_ext = extract_features(timeseries_container,
default_fc_parameters=default_fc_parameters,
kind_to_fc_parameters=kind_to_fc_parameters,
Expand Down
4 changes: 1 addition & 3 deletions tsfresh/feature_extraction/feature_calculators.py
Expand Up @@ -1789,14 +1789,12 @@ def energy_ratio_by_chunks(x, param):
which is the segment number (starting at zero) to return a feature on.
If the length of the time series is not a multiple of the number of segments, the remaining data points are
distributed on the bins starting from the first. For example, if your time series consists of 8 datapoints, the
distributed on the bins starting from the first. For example, if your time series consists of 8 entries, the
first two bins will contain 3 values and the last one 2 values, e.g. `[ 0., 1., 2.], [ 3., 4., 5.]` and `[ 6., 7.]`.
Note that the answer for `num_segments = 1` is a trivial "1" but we handle this scenario
in case somebody calls it. Sum of the ratios should be 1.0.
Returns an AssertionError for N <= 0
:param x: the time series to calculate the feature of
:type x: pandas.Series
:param param: contains dictionaries {"num_segments": N, "segment_focus": i} with N, i both ints
Expand Down
21 changes: 9 additions & 12 deletions tsfresh/feature_selection/selection.py
Expand Up @@ -128,21 +128,18 @@ def select_features(X, y, test_for_binary_target_binary_feature=defaults.TEST_FO
:raises: ``ValueError`` when the target vector does not fit to the feature matrix
or `ml_task` is not one of `'auto'`, `'classification'` or `'regression'`.
"""
assert isinstance(X, pd.DataFrame), "Please pass features in X as pandas.DataFrame."
check_for_nans_in_columns(X)
assert isinstance(y, (pd.Series, np.ndarray)), "The type of target vector y must be one of: " \
"pandas.Series, numpy.ndarray"
assert len(y) > 1, "y must contain at least two samples."
assert len(X) == len(y), "X and y must contain the same number of samples."
assert len(set(y)) > 1, "Feature selection is only possible if more than 1 label/class is provided"

if not isinstance(y, (pd.Series, np.ndarray)):
raise TypeError("The type of target vector y must be one of: pandas.Series, numpy.ndarray")

if len(X) < 2:
raise ValueError("X must contain at least two samples.")
elif len(set(y)) == 1:
raise ValueError("y contains only one kind of label, no feature selection possible.")
elif isinstance(y, pd.Series) and not X.index.isin(y.index).all():
raise ValueError("Index of X must be a subset of y's index")
elif isinstance(y, np.ndarray):
if not len(y) >= len(X):
raise ValueError("Target vector y is shorter than feature matrix X")
if isinstance(y, pd.Series) and set(X.index) != set(y.index):
raise ValueError("Index of X and y must be identical if provided")

if isinstance(y, np.ndarray):
y = pd.Series(y, index=X.index)

relevance_table = calculate_relevance_table(
Expand Down
22 changes: 22 additions & 0 deletions tsfresh/utilities/dataframe_functions.py
Expand Up @@ -209,6 +209,28 @@ def restrict_input_to_index(df_or_dict, column_id, index):
return df_or_dict_restricted


def get_ids(df_or_dict, column_id):
    """
    Aggregates all ids in column_id from the time series container.

    :param df_or_dict: a pandas DataFrame or a dictionary.
    :type df_or_dict: pandas.DataFrame or dict
    :param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary.
        It is not allowed to have NaN values in this column.
    :type column_id: basestring

    :return: a set with all existing ids in the time series container
        (an empty set if an empty dictionary is passed)
    :rtype: Set
    :raise: ``TypeError`` if df_or_dict is not of type dict or pandas.DataFrame
    """
    if isinstance(df_or_dict, pd.DataFrame):
        return set(df_or_dict[column_id])
    if isinstance(df_or_dict, dict):
        # Start from an empty set so that an empty dictionary yields an empty set;
        # the unbound ``set.union(*[])`` would raise a misleading TypeError instead.
        return set().union(*(df[column_id] for df in df_or_dict.values()))
    raise TypeError("df_or_dict should be of type dict or pandas.DataFrame")


# todo: add more testcases
# todo: rewrite in a more straightforward way
def _normalize_input_to_internal_representation(timeseries_container, column_id, column_sort, column_kind, column_value):
Expand Down

0 comments on commit 23b0efa

Please sign in to comment.