Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Select available columns in Select Columns Transformer #2944

Merged
merged 9 commits into from
Oct 21, 2021
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Release Notes
* Fixes
* Fixed bug where partial dependence was not respecting the ww schema :pr:`2929`
* Fixed ``calculate_permutation_importance`` for datetimes on ``StandardScaler`` :pr:`2938`
* Fixed ``SelectColumns`` to only select available features for feature selection in ``DefaultAlgorithm`` :pr:`2944`
* Changes
* Changed ``make_pipeline`` function to place the ``DateTimeFeaturizer`` prior to the ``Imputer`` so that ``NaN`` dates can be imputed :pr:`2909`
* Documentation Changes
Expand Down
22 changes: 14 additions & 8 deletions evalml/pipelines/components/transformers/column_selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ class SelectColumns(ColumnSelector):
"""Selects specified columns in input data.

Args:
columns (list(string)): List of column names, used to determine which columns to select.
columns (list(string)): List of column names, used to determine which columns to select. If columns are not present, they will not be selected.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""

Expand All @@ -110,20 +110,26 @@ class SelectColumns(ColumnSelector):
"""{}"""
needs_fitting = False

def _modify_columns(self, cols, X, y=None):
return X.ww[cols]
def _check_input_for_columns(self, X):
pass

def transform(self, X, y=None):
"""Transforms data X by selecting columns.
def fit(self, X, y=None):
"""Fits the transformer by checking if column names are present in the dataset.

Args:
X (pd.DataFrame): Data to transform.
X (pd.DataFrame): Data to check.
y (pd.Series, optional): Targets.

Returns:
pd.DataFrame: Transformed X.
self
"""
return super().transform(X, y)
return self

def _modify_columns(self, cols, X, y=None):
column_intersection = list(
sorted(set(cols).intersection(X.columns), key=cols.index)
)
return X.ww[column_intersection]


class SelectByType(ColumnSelector):
Expand Down
21 changes: 15 additions & 6 deletions evalml/tests/component_tests/test_column_selector_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ def test_column_transformer_empty_X(class_to_test):
transformer = class_to_test(column_types=["not in data"])
else:
transformer = class_to_test(columns=["not in data"])
with pytest.raises(ValueError, match="not found in input data"):
transformer.fit(X)
if class_to_test is not SelectColumns:
with pytest.raises(ValueError, match="not found in input data"):
transformer.fit(X)

transformer = class_to_test(columns=list(X.columns))
assert transformer.transform(X).empty
Expand Down Expand Up @@ -187,10 +188,10 @@ def test_column_transformer_fit_transform(class_to_test, checking_functions):
assert check3(X, class_to_test(columns=list(X.columns)).fit_transform(X))


@pytest.mark.parametrize("class_to_test", [DropColumns, SelectColumns])
def test_drop_column_transformer_input_invalid_col_name(class_to_test):
def test_drop_column_transformer_input_invalid_col_name():
X = pd.DataFrame({"one": [1, 2, 3, 4], "two": [2, 3, 4, 5], "three": [1, 2, 3, 4]})
transformer = class_to_test(columns=["not in data"])
transformer = DropColumns(columns=["not in data"])

with pytest.raises(ValueError, match="not found in input data"):
transformer.fit(X)
with pytest.raises(ValueError, match="not found in input data"):
Expand All @@ -199,7 +200,7 @@ def test_drop_column_transformer_input_invalid_col_name(class_to_test):
transformer.fit_transform(X)

X = np.arange(12).reshape(3, 4)
transformer = class_to_test(columns=[5])
transformer = DropColumns(columns=[5])
with pytest.raises(ValueError, match="not found in input data"):
transformer.fit(X)
with pytest.raises(ValueError, match="not found in input data"):
Expand Down Expand Up @@ -282,3 +283,11 @@ def test_typeortag_column_transformer_ww_logical_and_semantic_types():

X_t = SelectByType(column_types=["numeric"]).fit_transform(X)
assert X_t.astype(str).equals(X[["three", "four"]].astype(str))


def test_column_selector_missing_columns():
    """SelectColumns keeps only the requested columns that exist in X."""
    selector = SelectColumns(columns=["A", "B", "C", "D"])
    X = pd.DataFrame(columns=["A", "C", "F", "G"])

    X_t = selector.fit_transform(X)
    # Compare as plain lists: an elementwise ``==`` against a list of a
    # different length raises instead of failing the assertion, which hides
    # what was actually selected.
    assert list(X_t.columns) == ["A", "C"]