From 23b0efaf95010eca882192b5d5679d5a9cc0490c Mon Sep 17 00:00:00 2001 From: Maximilian Christ Date: Sun, 9 Sep 2018 11:48:46 +0200 Subject: [PATCH] Improve warnings (#433) * improve warnings and sanity checks for select features --- .../test_relevant_feature_extraction.py | 38 +++++++++++++++---- .../units/feature_selection/test_selection.py | 15 +++++--- .../utilities/test_dataframe_functions.py | 16 +++++++- tsfresh/convenience/relevant_extraction.py | 16 +++++++- .../feature_extraction/feature_calculators.py | 4 +- tsfresh/feature_selection/selection.py | 21 +++++----- tsfresh/utilities/dataframe_functions.py | 22 +++++++++++ 7 files changed, 102 insertions(+), 30 deletions(-) diff --git a/tests/integrations/test_relevant_feature_extraction.py b/tests/integrations/test_relevant_feature_extraction.py index e7fd66e45..4a437b23f 100644 --- a/tests/integrations/test_relevant_feature_extraction.py +++ b/tests/integrations/test_relevant_feature_extraction.py @@ -50,17 +50,16 @@ class RelevantFeatureExtractionTestCase(TestCase): def setUp(self): np.random.seed(42) - - y = pd.Series(np.random.binomial(1, 0.5, 10), index=range(10)) + y = pd.Series(np.random.binomial(1, 0.5, 20), index=range(20)) df = pd.DataFrame(index=range(100)) df["a"] = np.random.normal(0, 1, 100) df["b"] = np.random.normal(0, 1, 100) - df["id"] = np.repeat(range(10), 10) + df["id"] = np.repeat(range(20), 5) - X = pd.DataFrame(index=range(10)) - X["f1"] = np.random.normal(0, 1, 10) - X["f2"] = np.random.normal(0, 1, 10) + X = pd.DataFrame(index=range(20)) + X["f1"] = np.random.normal(0, 1, 20) + X["f2"] = np.random.normal(0, 1, 20) self.df = df self.X = X @@ -87,4 +86,29 @@ def test_extraction_null_as_column_name(self): df3 = pd.DataFrame(data={0: range(10), 2: np.repeat([0, 1], 5), 1: np.repeat([0, 1, 2, 3, 4], 2)}) X3 = extract_features(df3, column_id=2, column_sort=1) - self.assertEqual(len(X3), 2) \ No newline at end of file + self.assertEqual(len(X3), 2) + + def 
test_raises_mismatch_index_df_and_y_df_more(self): + y = pd.Series(range(3), index=[1, 2, 3]) + df_dict = {"a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), + "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} + self.assertRaises(ValueError, extract_relevant_features, df_dict, y, None, None, None, "id", None, "val") + + def test_raises_mismatch_index_df_and_y_y_more(self): + y = pd.Series(range(4), index=[1, 2, 3, 4]) + df = pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}) + self.assertRaises(ValueError, extract_relevant_features, df, y, None, None, None, "id", None, "val") + + def test_raises_y_not_series(self): + y = np.arange(10) + df_dict = {"a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), + "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} + self.assertRaises(AssertionError, extract_relevant_features, df_dict, y, None, None, None, "id", None, "val") + + def test_raises_y_not_more_than_one_label(self): + y = pd.Series(1, index=[1, 2, 3]) + df_dict = {"a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), + "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} + self.assertRaises(AssertionError, extract_relevant_features, df_dict, y, None, None, None, "id", None, "val") + + diff --git a/tests/units/feature_selection/test_selection.py b/tests/units/feature_selection/test_selection.py index 2d7c30866..8f9716a02 100644 --- a/tests/units/feature_selection/test_selection.py +++ b/tests/units/feature_selection/test_selection.py @@ -12,20 +12,19 @@ class TestSelectFeatures: def test_assert_list(self): - with pytest.raises(TypeError): + with pytest.raises(AssertionError): select_features(pd.DataFrame(index=range(2)),[1,2,3]) - def test_assert_one_row_X(self): X = pd.DataFrame([1], index=[1]) y = pd.Series([1], index=[1]) - with pytest.raises(ValueError): + with pytest.raises(AssertionError): 
select_features(X, y) def test_assert_one_label_y(self): X = pd.DataFrame([10, 10], index=[1, 2]) y = pd.Series([1, 1], index=[1, 2]) - with pytest.raises(ValueError): + with pytest.raises(AssertionError): select_features(X, y) def test_assert_different_index(self): @@ -34,13 +33,17 @@ def test_assert_different_index(self): with pytest.raises(ValueError): select_features(X, y) - def test_assert_shorter_y(self): X = pd.DataFrame([1, 2], index=[1, 2]) y = np.array([1]) - with pytest.raises(ValueError): + with pytest.raises(AssertionError): select_features(X, y) + def test_assert_X_is_DataFrame(self): + X = np.array([[1, 2], [1, 2]]) + y = np.array([1]) + with pytest.raises(AssertionError): + select_features(X, y) def test_selects_for_each_class(self): df = pd.DataFrame() diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py index 852d0cdda..593910a41 100644 --- a/tests/units/utilities/test_dataframe_functions.py +++ b/tests/units/utilities/test_dataframe_functions.py @@ -10,6 +10,8 @@ import six from pandas.testing import assert_frame_equal, assert_series_equal +from tsfresh.utilities.dataframe_functions import get_ids + class NormalizeTestCase(TestCase): def test_with_dictionaries_one_row(self): @@ -647,4 +649,16 @@ def test_make_forecasting_frame_pdSeries(self): "2011-01-01 02:00:00"]) }) assert_frame_equal(df.sort_index(axis=1), expected_df.sort_index(axis=1)) - assert_series_equal(y, expected_y) \ No newline at end of file + assert_series_equal(y, expected_y) + + +class GetIDsTestCase(TestCase): + + def test_get_id__correct_DataFrame(self): + df = pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}) + self.assertEqual(get_ids(df, "id"), set([1, 2])) + + def test_get_id__correct_dict(self): + df_dict = {"a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), + "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} + 
self.assertEqual(get_ids(df_dict, "id"), set([1, 2, 3, 4])) diff --git a/tsfresh/convenience/relevant_extraction.py b/tsfresh/convenience/relevant_extraction.py index cca2951ba..99910e9bc 100644 --- a/tsfresh/convenience/relevant_extraction.py +++ b/tsfresh/convenience/relevant_extraction.py @@ -7,7 +7,7 @@ from tsfresh.feature_extraction import extract_features from tsfresh import defaults from tsfresh.feature_selection import select_features -from tsfresh.utilities.dataframe_functions import restrict_input_to_index, impute +from tsfresh.utilities.dataframe_functions import restrict_input_to_index, impute, get_ids def extract_relevant_features(timeseries_container, y, X=None, @@ -133,9 +133,23 @@ def extract_relevant_features(timeseries_container, y, X=None, :return: Feature matrix X, possibly extended with relevant time series features. """ + + assert isinstance(y, pd.Series) + assert len(set(y)) > 1, "Feature selection is only possible if more than 1 label/class is provided" + if X is not None: timeseries_container = restrict_input_to_index(timeseries_container, column_id, X.index) + ids_container = get_ids(df_or_dict=timeseries_container, column_id=column_id) + ids_y = set(y.index) + if ids_container != ids_y: + if len(ids_container - ids_y) > 0: + raise ValueError("The following ids are in the time series container but are missing in y: " + "{}".format(ids_container - ids_y)) + if len(ids_y - ids_container) > 0: + raise ValueError("The following ids are in y but are missing inside the time series container: " + "{}".format(ids_y - ids_container)) + X_ext = extract_features(timeseries_container, default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters, diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py index 41458ffee..48f5c4320 100644 --- a/tsfresh/feature_extraction/feature_calculators.py +++ b/tsfresh/feature_extraction/feature_calculators.py @@ -1789,14 +1789,12 
@@ def energy_ratio_by_chunks(x, param): which is the segment number (starting at zero) to return a feature on. If the length of the time series is not a multiple of the number of segments, the remaining data points are - distributed on the bins starting from the first. For example, if your time series consists of 8 datapoints, the + distributed on the bins starting from the first. For example, if your time series consists of 8 entries, the first two bins will contain 3 and the last two values, e.g. `[ 0., 1., 2.], [ 3., 4., 5.]` and `[ 6., 7.]`. Note that the answer for `num_segments = 1` is a trivial "1" but we handle this scenario in case somebody calls it. Sum of the ratios should be 1.0. - Returns an AssertionError for N <= 0 - :param x: the time series to calculate the feature of :type x: pandas.Series :param param: contains dictionaries {"num_segments": N, "segment_focus": i} with N, i both ints diff --git a/tsfresh/feature_selection/selection.py b/tsfresh/feature_selection/selection.py index a3d526834..5a8c3729a 100644 --- a/tsfresh/feature_selection/selection.py +++ b/tsfresh/feature_selection/selection.py @@ -128,21 +128,18 @@ def select_features(X, y, test_for_binary_target_binary_feature=defaults.TEST_FO :raises: ``ValueError`` when the target vector does not fit to the feature matrix or `ml_task` is not one of `'auto'`, `'classification'` or `'regression'`. """ + assert isinstance(X, pd.DataFrame), "Please pass features in X as pandas.DataFrame." check_for_nans_in_columns(X) + assert isinstance(y, (pd.Series, np.ndarray)), "The type of target vector y must be one of: " \ + "pandas.Series, numpy.ndarray" + assert len(y) > 1, "y must contain at least two samples." + assert len(X) == len(y), "X and y must contain the same number of samples." 
+ assert len(set(y)) > 1, "Feature selection is only possible if more than 1 label/class is provided" - if not isinstance(y, (pd.Series, np.ndarray)): - raise TypeError("The type of target vector y must be one of: pandas.Series, numpy.ndarray") - - if len(X) < 2: - raise ValueError("X must contain at least two samples.") - elif len(set(y)) == 1: - raise ValueError("y contains only one kind of label, no feature selection possible.") - elif isinstance(y, pd.Series) and not X.index.isin(y.index).all(): - raise ValueError("Index of X must be a subset of y's index") - elif isinstance(y, np.ndarray): - if not len(y) >= len(X): - raise ValueError("Target vector y is shorter than feature matrix X") + if isinstance(y, pd.Series) and set(X.index) != set(y.index): + raise ValueError("Index of X and y must be identical if provided") + if isinstance(y, np.ndarray): y = pd.Series(y, index=X.index) relevance_table = calculate_relevance_table( diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index 41bda426a..7198a7c9c 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -209,6 +209,28 @@ def restrict_input_to_index(df_or_dict, column_id, index): return df_or_dict_restricted +def get_ids(df_or_dict, column_id): + """ + Aggregates all ids in column_id from the time series container. + + :param df_or_dict: a pandas DataFrame or a dictionary. + :type df_or_dict: pandas.DataFrame or dict + :param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary. + It is not allowed to have NaN values in this column. 
+ :type column_id: basestring + + :return: a set with all existing ids in the time series container + :rtype: Set + :raise: ``TypeError`` if df_or_dict is not of type dict or pandas.DataFrame + """ + if isinstance(df_or_dict, pd.DataFrame): + return set(df_or_dict[column_id]) + elif isinstance(df_or_dict, dict): + return set.union(*[set(df[column_id]) for _, df in df_or_dict.items()]) + else: + raise TypeError("df_or_dict should be of type dict or pandas.DataFrame") + + # todo: add more testcases # todo: rewrite in a more straightforward way def _normalize_input_to_internal_representation(timeseries_container, column_id, column_sort, column_kind, column_value):