/
column_selectors.py
125 lines (90 loc) · 3.52 KB
/
column_selectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from abc import abstractmethod
import numpy as np
import pandas as pd
from evalml.pipelines.components.transformers import Transformer
class ColumnSelector(Transformer):
    """Base class for transformers that act on a configured list of columns.

    Subclasses implement ``_modify_columns`` to decide what is done with the
    configured columns.
    """

    def __init__(self, columns=None, random_state=0, **kwargs):
        """Initializes a transformer that modifies specified columns in input data.

        Arguments:
            columns (list(string)): List of column names, used to determine which columns to modify.
            random_state: Forwarded unchanged to the parent ``Transformer`` initializer.
            **kwargs: Additional entries merged into the component parameters.

        Raises:
            ValueError: If ``columns`` is truthy and is not a list.
        """
        # Falsy values (None, []) are accepted and later treated as "no columns".
        if columns and not isinstance(columns, list):
            raise ValueError(f"Parameter columns must be a list. Received {type(columns)}.")

        parameters = {"columns": columns}
        parameters.update(kwargs)
        super().__init__(parameters=parameters,
                         component_obj=None,
                         random_state=random_state)

    def _check_input_for_columns(self, X):
        """Verify that every configured column is present in X.

        Arguments:
            X: Input data. NumPy arrays are checked against positional column
                indices, objects exposing ``.columns`` against those labels, and
                anything else is wrapped in a ``pd.DataFrame`` first.

        Raises:
            ValueError: If one or more configured columns are not found in X.
        """
        cols = self.parameters.get("columns") or []

        if isinstance(X, np.ndarray):
            column_names = range(X.shape[1])
        elif hasattr(X, "columns"):
            column_names = X.columns
        else:
            # transform() wraps such input in a DataFrame; do the same here so
            # fit() accepts the same inputs instead of raising AttributeError.
            column_names = pd.DataFrame(X).columns

        missing_cols = set(cols) - set(column_names)
        if missing_cols:
            # Sets have no stable order; sort by string form so the message is
            # deterministic even when labels mix ints and strings.
            ordered_missing = sorted(missing_cols, key=str)
            raise ValueError(
                "Columns {} not found in input data".format(', '.join(f"'{col_name}'" for col_name in ordered_missing))
            )

    @abstractmethod
    def _modify_columns(self, cols, X, y=None):
        """How the transformer modifies the columns of the input data.

        Arguments:
            cols (list): Configured column names (an empty list when none were given).
            X (pd.DataFrame): Data to modify.
            y (pd.Series, optional): Targets.

        Returns:
            The modified data; this is what ``transform`` returns.
        """

    def fit(self, X, y=None):
        """'Fits' the transformer by checking if the column names are present in the dataset.

        Arguments:
            X (pd.DataFrame): Data to check.
            y (pd.Series, optional): Targets.

        Returns:
            self

        Raises:
            ValueError: If any configured column is missing from X.
        """
        self._check_input_for_columns(X)
        return self

    def transform(self, X, y=None):
        """Validates the configured columns against X and applies ``_modify_columns``.

        Arguments:
            X (pd.DataFrame): Data to transform. Non-DataFrame input is wrapped in a DataFrame.
            y (pd.Series, optional): Targets.

        Returns:
            The result of ``_modify_columns`` for the configured columns.

        Raises:
            ValueError: If any configured column is missing from X.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        self._check_input_for_columns(X)

        cols = self.parameters.get("columns") or []
        return self._modify_columns(cols, X, y)

    def fit_transform(self, X, y=None):
        """Fit transformer to data, then transform data.

        Arguments:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Targets.

        Returns:
            pd.DataFrame: Transformed X.
        """
        # fit() only validates column presence; transform() repeats that check
        # on the DataFrame-wrapped input before modifying it.
        self.fit(X, y)
        return self.transform(X, y)
class DropColumns(ColumnSelector):
    """Transformer that removes the configured columns from the input data."""

    name = "Drop Columns Transformer"
    hyperparameter_ranges = {}
    needs_fitting = False

    def _modify_columns(self, cols, X, y=None):
        """Return X without the columns listed in ``cols``."""
        remaining = X.drop(columns=cols)
        return remaining

    def transform(self, X, y=None):
        """Drop the configured columns from X.

        Arguments:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Targets.

        Returns:
            pd.DataFrame: X with the configured columns removed.
        """
        transformed = super().transform(X, y)
        return transformed
class SelectColumns(ColumnSelector):
    """Transformer that keeps only the configured columns of the input data."""

    name = "Select Columns Transformer"
    hyperparameter_ranges = {}
    needs_fitting = False

    def _modify_columns(self, cols, X, y=None):
        """Return the subset of X made up of the columns listed in ``cols``."""
        selected = X[cols]
        return selected

    def transform(self, X, y=None):
        """Keep only the configured columns of X.

        Arguments:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Targets.

        Returns:
            pd.DataFrame: X restricted to the configured columns.
        """
        transformed = super().transform(X, y)
        return transformed