-
Notifications
You must be signed in to change notification settings - Fork 91
Update components and pipelines to return Woodwork data structures #1668
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
393052b
2746e74
7260503
f1d74c2
1aee7f4
9abf008
d311015
3c27d68
482d2f1
2ad0526
001dc59
a24468d
f4eb8c0
49c8400
ab8789f
0865d32
c99808b
704c1c4
c348d79
231523b
a93c8b3
4b519f5
094698d
559fa05
72a0b9d
86f4dbc
350aa69
fc45967
d619154
90a4190
fa9f098
8e22a48
bb10ab1
8699d37
94e4655
93dcc79
f247141
2e7cfe6
4c2666d
b2244b9
da5c82d
18a7ee4
8cc1e52
318d785
b27eca8
20a8116
f43d4d1
5430ebc
16f58a9
b241e39
12c30ae
3e0fe81
2d4a0e6
b90f915
03662c6
1c4d1cf
4425ad7
646c985
0295348
326b550
d6fec28
95f11f7
cc95be1
a3fd671
37f8028
a713574
f18b64d
1ccb152
b0dd92e
afe0741
69c83ca
7c38b62
8ce1c11
1171abf
4dcddb9
0bc7136
7b4e4e5
c21c3aa
316d7d2
0c69dfb
152d1ce
0ac7bb9
23edbce
35d5972
96505ec
17ed8be
1164572
f88392d
8ec9d01
7a4aae5
37594b3
66c04f4
61d0ac0
e0a839b
9394045
e7233f7
6af7dfb
c163d98
9ecbace
ed9cbd6
bb1337d
6c1ef89
94ad165
e144a0a
6e41ee7
0d8d540
ade7fb5
c084c4d
a8149f7
bd182e1
11dd1af
ecd99b1
8692830
253ced5
f8eb2b1
0babaca
72389b7
f6d343f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -148,8 +148,11 @@ | |
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "import pandas as pd\n", | ||
| "from evalml.pipelines.components import Transformer\n", | ||
| "from evalml.utils.gen_utils import (\n", | ||
| " _convert_to_woodwork_structure,\n", | ||
| " _convert_woodwork_types_wrapper\n", | ||
| ")\n", | ||
| "\n", | ||
| "class DropNullColumns(Transformer):\n", | ||
| " \"\"\"Transformer to drop features whose percentage of NaN values exceeds a specified threshold\"\"\"\n", | ||
|
|
@@ -175,10 +178,19 @@ | |
| " random_state=random_state)\n", | ||
| "\n", | ||
| " def fit(self, X, y=None):\n", | ||
| " \"\"\"Fits DropNullColumns component to data\n", | ||
| "\n", | ||
| " Arguments:\n", | ||
| " X (list, ww.DataTable, pd.DataFrame): The input training data of shape [n_samples, n_features]\n", | ||
| " y (list, ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples]\n", | ||
| "\n", | ||
| " Returns:\n", | ||
| " self\n", | ||
| " \"\"\"\n", | ||
| " pct_null_threshold = self.parameters[\"pct_null_threshold\"]\n", | ||
| " if not isinstance(X, pd.DataFrame):\n", | ||
| " X = pd.DataFrame(X)\n", | ||
| " percent_null = X.isnull().mean()\n", | ||
| " X_t = _convert_to_woodwork_structure(X)\n", | ||
| " X_t = _convert_woodwork_types_wrapper(X_t.to_dataframe())\n", | ||
| " percent_null = X_t.isnull().mean()\n", | ||
| " if pct_null_threshold == 0.0:\n", | ||
| " null_cols = percent_null[percent_null > 0]\n", | ||
| " else:\n", | ||
|
|
@@ -188,16 +200,16 @@ | |
| "\n", | ||
| " def transform(self, X, y=None):\n", | ||
| " \"\"\"Transforms data X by dropping columns that exceed the threshold of null values.\n", | ||
| "\n", | ||
| " Arguments:\n", | ||
| " X (pd.DataFrame): Data to transform\n", | ||
| " y (pd.Series, optional): Targets\n", | ||
| " X (ww.DataTable, pd.DataFrame): Data to transform\n", | ||
| " y (ww.DataColumn, pd.Series, optional): Ignored.\n", | ||
| "\n", | ||
| " Returns:\n", | ||
| " pd.DataFrame: Transformed X\n", | ||
| " ww.DataTable: Transformed X\n", | ||
| " \"\"\"\n", | ||
| "\n", | ||
| " if not isinstance(X, pd.DataFrame):\n", | ||
| " X = pd.DataFrame(X)\n", | ||
| " return X.drop(columns=self._cols_to_drop, axis=1)" | ||
| " X_t = _convert_to_woodwork_structure(X)\n", | ||
| " return X_t.drop(self._cols_to_drop)" | ||
| ] | ||
| }, | ||
| { | ||
|
|
@@ -214,9 +226,9 @@ | |
| "\n", | ||
| "- `__init__()` - the `__init__()` method of your transformer will need to call `super().__init__()` and pass three parameters in: a `parameters` dictionary holding the parameters to the component, the `component_obj`, and the `random_state` value. You can see that `component_obj` is set to `None` above and we will discuss `component_obj` in depth later on.\n", | ||
| "\n", | ||
| "- `fit()` - the `fit()` method is responsible for fitting your component on training data.\n", | ||
| "- `fit()` - the `fit()` method is responsible for fitting your component on training data. It should return the component object.\n", | ||
| "\n", | ||
| "- `transform()` - after fitting a component, the `transform()` method will take in new data and transform accordingly. Note: a component must call `fit()` before `transform()`.\n", | ||
| "- `transform()` - after fitting a component, the `transform()` method will take in new data and transform accordingly. It should return a Woodwork DataTable. Note: a component must call `fit()` before `transform()`.\n", | ||
| "\n", | ||
| "You can also call or override `fit_transform()` that combines `fit()` and `transform()` into one method." | ||
| ] | ||
|
|
@@ -252,14 +264,14 @@ | |
| " name = \"Baseline Regressor\"\n", | ||
| " hyperparameter_ranges = {}\n", | ||
| " model_family = ModelFamily.BASELINE\n", | ||
| " supported_problem_types = [ProblemTypes.REGRESSION]\n", | ||
| " supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]\n", | ||
| "\n", | ||
| " def __init__(self, strategy=\"mean\", random_state=0, **kwargs):\n", | ||
| " \"\"\"Baseline regressor that uses a simple strategy to make predictions.\n", | ||
| "\n", | ||
| " Arguments:\n", | ||
| " strategy (str): Method used to predict. Valid options are \"mean\", \"median\". Defaults to \"mean\".\n", | ||
| " random_state (int): Seed for the random number generator\n", | ||
| " random_state (int): Seed for the random number generator. Defaults to 0.\n", | ||
| "\n", | ||
| " \"\"\"\n", | ||
| " if strategy not in [\"mean\", \"median\"]:\n", | ||
|
|
@@ -276,9 +288,9 @@ | |
| " def fit(self, X, y=None):\n", | ||
| " if y is None:\n", | ||
| " raise ValueError(\"Cannot fit Baseline regressor if y is None\")\n", | ||
| "\n", | ||
| " if not isinstance(y, pd.Series):\n", | ||
| " y = pd.Series(y)\n", | ||
| " X = _convert_to_woodwork_structure(X)\n", | ||
| " y = _convert_to_woodwork_structure(y)\n", | ||
| " y = _convert_woodwork_types_wrapper(y.to_series())\n", | ||
| "\n", | ||
| " if self.parameters[\"strategy\"] == \"mean\":\n", | ||
| " self._prediction_value = y.mean()\n", | ||
|
|
@@ -288,7 +300,9 @@ | |
| " return self\n", | ||
| "\n", | ||
| " def predict(self, X):\n", | ||
| " return pd.Series([self._prediction_value] * len(X))\n", | ||
| " X = _convert_to_woodwork_structure(X)\n", | ||
| " predictions = pd.Series([self._prediction_value] * len(X))\n", | ||
| " return _convert_to_woodwork_structure(predictions)\n", | ||
| "\n", | ||
| " @property\n", | ||
| " def feature_importance(self):\n", | ||
|
|
@@ -298,7 +312,7 @@ | |
| " np.ndarray (float): An array of zeroes\n", | ||
| "\n", | ||
| " \"\"\"\n", | ||
| " return np.zeros(self._num_features)" | ||
| " return np.zeros(self._num_features)\n" | ||
| ] | ||
| }, | ||
| { | ||
|
|
@@ -402,45 +416,6 @@ | |
| "AutoML will perform a search over the allowed ranges for each parameter to select models which produce optimal performance within those ranges. AutoML gets the allowed ranges for each component from the component's `hyperparameter_ranges` class attribute. Any component parameter you add an entry for in `hyperparameter_ranges` will be included in the AutoML search. If parameters are omitted, AutoML will use the default value in all pipelines. " | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "from sklearn.linear_model import LinearRegression as SKLinearRegression\n", | ||
| "\n", | ||
| "from evalml.model_family import ModelFamily\n", | ||
| "from evalml.pipelines.components.estimators import Estimator\n", | ||
| "from evalml.problem_types import ProblemTypes\n", | ||
| "\n", | ||
| "class LinearRegressor(Estimator):\n", | ||
| " \"\"\"Linear Regressor.\"\"\"\n", | ||
| " name = \"Linear Regressor\"\n", | ||
| " hyperparameter_ranges = {\n", | ||
| " 'fit_intercept': [True, False],\n", | ||
| " 'normalize': [True, False]\n", | ||
| " }\n", | ||
| " model_family = ModelFamily.LINEAR_MODEL\n", | ||
| " supported_problem_types = [ProblemTypes.REGRESSION]\n", | ||
| "\n", | ||
| " def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1, random_state=0, **kwargs):\n", | ||
| " parameters = {\n", | ||
| " 'fit_intercept': fit_intercept,\n", | ||
| " 'normalize': normalize,\n", | ||
| " 'n_jobs': n_jobs\n", | ||
| " }\n", | ||
| " parameters.update(kwargs)\n", | ||
| " linear_regressor = SKLinearRegression(**parameters)\n", | ||
| " super().__init__(parameters=parameters,\n", | ||
| " component_obj=linear_regressor,\n", | ||
| " random_state=random_state)\n", | ||
| "\n", | ||
| " @property\n", | ||
| " def feature_importance(self):\n", | ||
| " return self._component_obj.coef_" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
|
|
@@ -471,8 +446,7 @@ | |
| "outputs": [], | ||
| "source": [ | ||
| "# this string can then be copy and pasted into a separate window and executed as python code\n", | ||
| "exec(code)\n", | ||
| "logisticRegressionClassifier" | ||
| "exec(code)" | ||
| ] | ||
| }, | ||
| { | ||
|
|
@@ -481,60 +455,10 @@ | |
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "# custom component\n", | ||
| "from evalml.pipelines.components import Transformer\n", | ||
| "import pandas as pd\n", | ||
| "# We can also do this for custom components\n", | ||
| "from evalml.pipelines.components.utils import generate_component_code\n", | ||
| "\n", | ||
| "class MyDropNullColumns(Transformer):\n", | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No need to repeat this, I believe the only difference is the name and there's nothing special about this necessary for code gen so deleting! |
||
| " \"\"\"Transformer to drop features whose percentage of NaN values exceeds a specified threshold\"\"\"\n", | ||
| " name = \"My Drop Null Columns Transformer\"\n", | ||
| " hyperparameter_ranges = {}\n", | ||
| "\n", | ||
| " def __init__(self, pct_null_threshold=1.0, random_state=0, **kwargs):\n", | ||
| " \"\"\"Initalizes an transformer to drop features whose percentage of NaN values exceeds a specified threshold.\n", | ||
| "\n", | ||
| " Arguments:\n", | ||
| " pct_null_threshold(float): The percentage of NaN values in an input feature to drop.\n", | ||
| " Must be a value between [0, 1] inclusive. If equal to 0.0, will drop columns with any null values.\n", | ||
| " If equal to 1.0, will drop columns with all null values. Defaults to 0.95.\n", | ||
| " \"\"\"\n", | ||
| " if pct_null_threshold < 0 or pct_null_threshold > 1:\n", | ||
| " raise ValueError(\"pct_null_threshold must be a float between 0 and 1, inclusive.\")\n", | ||
| " parameters = {\"pct_null_threshold\": pct_null_threshold}\n", | ||
| " parameters.update(kwargs)\n", | ||
| "\n", | ||
| " self._cols_to_drop = None\n", | ||
| " super().__init__(parameters=parameters,\n", | ||
| " component_obj=None,\n", | ||
| " random_state=random_state)\n", | ||
| "\n", | ||
| " def fit(self, X, y=None):\n", | ||
| " pct_null_threshold = self.parameters[\"pct_null_threshold\"]\n", | ||
| " if not isinstance(X, pd.DataFrame):\n", | ||
| " X = pd.DataFrame(X)\n", | ||
| " percent_null = X.isnull().mean()\n", | ||
| " if pct_null_threshold == 0.0:\n", | ||
| " null_cols = percent_null[percent_null > 0]\n", | ||
| " else:\n", | ||
| " null_cols = percent_null[percent_null >= pct_null_threshold]\n", | ||
| " self._cols_to_drop = list(null_cols.index)\n", | ||
| " return self\n", | ||
| "\n", | ||
| " def transform(self, X, y=None):\n", | ||
| " \"\"\"Transforms data X by dropping columns that exceed the threshold of null values.\n", | ||
| " Arguments:\n", | ||
| " X (pd.DataFrame): Data to transform\n", | ||
| " y (pd.Series, optional): Targets\n", | ||
| " Returns:\n", | ||
| " pd.DataFrame: Transformed X\n", | ||
| " \"\"\"\n", | ||
| "\n", | ||
| " if not isinstance(X, pd.DataFrame):\n", | ||
| " X = pd.DataFrame(X)\n", | ||
| " return X.drop(columns=self._cols_to_drop, axis=1)\n", | ||
| " \n", | ||
| "myDropNull = MyDropNullColumns()\n", | ||
| "myDropNull = DropNullColumns()\n", | ||
| "print(generate_component_code(myDropNull))" | ||
| ] | ||
| }, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -389,6 +389,7 @@ | |
| "from evalml.pipelines.utils import generate_pipeline_code\n", | ||
| "from evalml.pipelines import MulticlassClassificationPipeline\n", | ||
| "import pandas as pd\n", | ||
| "from evalml.utils import _convert_to_woodwork_structure, _convert_woodwork_types_wrapper\n", | ||
| "\n", | ||
| "class MyDropNullColumns(Transformer):\n", | ||
| " \"\"\"Transformer to drop features whose percentage of NaN values exceeds a specified threshold\"\"\"\n", | ||
|
|
@@ -415,8 +416,8 @@ | |
| "\n", | ||
| " def fit(self, X, y=None):\n", | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updating demo code to be a transformer that handles woodwork :') necessary for fitting of pipeline below! |
||
| " pct_null_threshold = self.parameters[\"pct_null_threshold\"]\n", | ||
| " if not isinstance(X, pd.DataFrame):\n", | ||
| " X = pd.DataFrame(X)\n", | ||
| " X = _convert_to_woodwork_structure(X)\n", | ||
| " X = _convert_woodwork_types_wrapper(X.to_dataframe())\n", | ||
| " percent_null = X.isnull().mean()\n", | ||
| " if pct_null_threshold == 0.0:\n", | ||
| " null_cols = percent_null[percent_null > 0]\n", | ||
|
|
@@ -434,9 +435,9 @@ | |
| " pd.DataFrame: Transformed X\n", | ||
| " \"\"\"\n", | ||
| "\n", | ||
| " if not isinstance(X, pd.DataFrame):\n", | ||
| " X = pd.DataFrame(X)\n", | ||
| " return X.drop(columns=self._cols_to_drop, axis=1)\n", | ||
| " X = _convert_to_woodwork_structure(X)\n", | ||
| " return X.drop(columns=self._cols_to_drop)\n", | ||
| "\n", | ||
| "\n", | ||
| "class CustomPipeline(MulticlassClassificationPipeline):\n", | ||
| " name = \"Custom Pipeline\"\n", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -157,7 +157,7 @@ def __init__(self, | |
| additional_objectives (list): Custom set of objectives to score on. | ||
| Will override default objectives for problem type if not empty. | ||
|
|
||
| random_state (int): The random seed. Defaults to 0. | ||
| random_state (int): Seed for the random number generator. Defaults to 0. | ||
|
|
||
| n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. | ||
| None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. | ||
|
|
@@ -566,8 +566,8 @@ def _tune_binary_threshold(self, pipeline, X_threshold_tuning, y_threshold_tunin | |
|
|
||
| Arguments: | ||
| pipeline (Pipeline): Pipeline instance to threshold | ||
| X_threshold_tuning (ww DataTable): X data to tune pipeline to | ||
| y_threshold_tuning (ww DataColumn): Target data to tune pipeline to | ||
| X_threshold_tuning (ww.DataTable): X data to tune pipeline to | ||
| y_threshold_tuning (ww.DataColumn): Target data to tune pipeline to | ||
|
|
||
| Returns: | ||
| Trained pipeline instance | ||
|
|
@@ -576,10 +576,7 @@ def _tune_binary_threshold(self, pipeline, X_threshold_tuning, y_threshold_tunin | |
| pipeline.threshold = 0.5 | ||
| if X_threshold_tuning: | ||
| y_predict_proba = pipeline.predict_proba(X_threshold_tuning) | ||
| if isinstance(y_predict_proba, pd.DataFrame): | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Cleanup: we no longer need to worry about whether y_predict_proba is returning np/pd, so no need for this check :D |
||
| y_predict_proba = y_predict_proba.iloc[:, 1] | ||
| else: | ||
| y_predict_proba = y_predict_proba[:, 1] | ||
| y_predict_proba = y_predict_proba.iloc[:, 1] | ||
| pipeline.threshold = self.objective.optimize_threshold(y_predict_proba, y_threshold_tuning, X=X_threshold_tuning) | ||
| return pipeline | ||
|
|
||
|
|
@@ -849,7 +846,7 @@ def get_pipeline(self, pipeline_id, random_state=0): | |
|
|
||
| Arguments: | ||
| pipeline_id (int): pipeline to retrieve | ||
| random_state (int): The random seed. Defaults to 0. | ||
| random_state (int): Seed for the random number generator. Defaults to 0. | ||
|
|
||
| Returns: | ||
| PipelineBase: untrained pipeline instance associated with the provided ID | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is accidental duplicate code, deleting 😱