Allow Objectives to take in list inputs #1663

Merged: 11 commits, Jan 11, 2021
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -9,6 +9,7 @@ Release Notes
* Added multiclass check to ``InvalidTargetDataCheck`` for two examples per class :pr:`1596`
* Support graphviz 0.16 :pr:`1657`
* Enhanced time series pipelines to accept empty features :pr:`1651`
* Added support for list inputs for objectives :pr:`1663`
* Fixes
* Fixed thresholding for pipelines in ``AutoMLSearch`` to only threshold binary classification pipelines :pr:`1622` :pr:`1626`
* Updated ``load_data`` to return Woodwork structures and update default parameter value for ``index`` to ``None`` :pr:`1610`
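What the new objective support looks like in practice: a minimal sketch, assuming the `F1` objective and its `score(y_true, y_predicted)` API; plain Python lists now work without first wrapping them in pandas or woodwork.

```python
from evalml.objectives import F1

# y_true and y_predicted as plain lists; both are standardized to pandas internally.
objective = F1()
score = objective.score([0, 1, 1, 0], [0, 1, 0, 0])
print(score)  # 0.666...: precision 1.0, recall 0.5
```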
11 changes: 8 additions & 3 deletions evalml/objectives/objective_base.py
@@ -76,7 +76,7 @@ def _standardize_input_type(input_data):
"""Standardize input to pandas for scoring.

Arguments:
input_data (ww.DataTable, ww.DataColumn, pd.DataFrame, pd.Series, or np.ndarray): A matrix of predictions or predicted probabilities
input_data (list, ww.DataTable, ww.DataColumn, pd.DataFrame, pd.Series, or np.ndarray): A matrix of predictions or predicted probabilities

Returns:
pd.DataFrame or pd.Series: a pd.Series, or pd.DataFrame object if predicted probabilities were provided.
@@ -87,9 +87,14 @@ def _standardize_input_type(input_data):
return _convert_woodwork_types_wrapper(input_data.to_dataframe())
if isinstance(input_data, ww.DataColumn):
return _convert_woodwork_types_wrapper(input_data.to_series())
if isinstance(input_data, list):
    if isinstance(input_data[0], list):
        return pd.DataFrame(input_data)
    return pd.Series(input_data)
if isinstance(input_data, np.ndarray):
    if len(input_data.shape) == 1:
        return pd.Series(input_data)
    return pd.DataFrame(input_data)

def validate_inputs(self, y_true, y_predicted):
"""Validates the input based on a few simple checks.
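A self-contained sketch of the dispatch above (the function name here is illustrative, not evalml's API):

```python
import numpy as np
import pandas as pd

def standardize(input_data):
    # Mirrors the list/ndarray handling in ObjectiveBase._standardize_input_type:
    # a flat list becomes a Series, a list of lists becomes a DataFrame
    # (a matrix of predicted probabilities).
    if isinstance(input_data, list):
        if isinstance(input_data[0], list):
            return pd.DataFrame(input_data)
        return pd.Series(input_data)
    if isinstance(input_data, np.ndarray):
        if len(input_data.shape) == 1:
            return pd.Series(input_data)
        return pd.DataFrame(input_data)
    return input_data

assert isinstance(standardize([0.1, 0.9, 0.4]), pd.Series)
assert isinstance(standardize([[0.1, 0.9], [0.8, 0.2]]), pd.DataFrame)
```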
4 changes: 2 additions & 2 deletions evalml/pipelines/components/component_base.py
@@ -86,8 +86,8 @@ def fit(self, X, y=None):
"""Fits component to data

Arguments:
X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
y (ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples]
X (list, ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
y (list, ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples]

Returns:
self
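With the widened signature, a component can be fit directly on lists; a usage sketch, assuming evalml's `StandardScaler` component lives at this import path:

```python
from evalml.pipelines.components import StandardScaler

# X as a list of rows, y as a flat list; both are converted internally.
X = [[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]
y = [0, 1, 0]
StandardScaler().fit(X, y)
```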
@@ -91,7 +91,8 @@ def _encode_categories(self, X, fit=False):
return X_encoded

def _encode_labels(self, y):
y_encoded = pd.Series(y)
y_encoded = _convert_to_woodwork_structure(y)
y_encoded = _convert_woodwork_types_wrapper(y_encoded.to_series())
# change only if dtype isn't int
if not is_integer_dtype(y_encoded):
self._label_encoder = LabelEncoder()
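A standalone sketch of the encoding path, with plain pandas standing in for the woodwork helpers (illustrative only):

```python
import pandas as pd
from pandas.api.types import is_integer_dtype
from sklearn.preprocessing import LabelEncoder

def encode_labels(y):
    # List or array input is first standardized to a pandas Series.
    y_encoded = pd.Series(y)
    # Re-encode only when the dtype isn't already integer.
    if not is_integer_dtype(y_encoded):
        y_encoded = pd.Series(LabelEncoder().fit_transform(y_encoded))
    return y_encoded

print(encode_labels(["a", "b", "a"]).tolist())  # [0, 1, 0]
```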
10 changes: 9 additions & 1 deletion evalml/pipelines/components/transformers/transformer.py
@@ -3,6 +3,10 @@
from evalml.exceptions import MethodPropertyNotFoundError
from evalml.model_family import ModelFamily
from evalml.pipelines.components import ComponentBase
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)


class Transformer(ComponentBase):
@@ -47,7 +51,11 @@ def fit_transform(self, X, y=None):
pd.DataFrame: Transformed X
"""
try:
X_t = self._component_obj.fit_transform(X, y)
X2 = _convert_to_woodwork_structure(X)
y2 = _convert_to_woodwork_structure(y)
X2 = _convert_woodwork_types_wrapper(X2.to_dataframe())
y2 = _convert_woodwork_types_wrapper(y2.to_series())
X_t = self._component_obj.fit_transform(X2, y2)
except AttributeError:
try:
self.fit(X, y)
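The idea: normalize whatever the caller passes into pandas before delegating to the wrapped object. A minimal standalone sketch, with scikit-learn's `StandardScaler` standing in for `self._component_obj` (an assumption for illustration):

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

def fit_transform(X, y=None):
    # Standardize list/ndarray input to pandas, then delegate,
    # mirroring the conversion added to Transformer.fit_transform.
    X2 = pd.DataFrame(X)
    y2 = pd.Series(y) if y is not None else None
    return StandardScaler().fit_transform(X2, y2)

print(fit_transform([[1.0, 2.0], [3.0, 4.0]], y=[0, 1]))
```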
15 changes: 5 additions & 10 deletions evalml/tests/automl_tests/test_automl.py
@@ -1050,10 +1050,10 @@ def test_results_getter(mock_fit, mock_score, X_y_binary):
assert automl.results['pipeline_results'][0]['score'] == 1.0


@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww'])
@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww'])
@pytest.mark.parametrize("automl_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS])
@pytest.mark.parametrize("target_type", ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool', 'category', 'object', 'Int64', 'boolean'])
def test_targets_pandas_data_types_classification(data_type, automl_type, target_type):
def test_targets_pandas_data_types_classification(data_type, automl_type, target_type, make_data_type):
if data_type == 'np' and target_type in ['Int64', 'boolean']:
pytest.skip("Skipping test where data type is numpy and target type is nullable dtype")

@@ -1076,14 +1076,9 @@ def test_targets_pandas_data_types_classification(data_type, automl_type, target_type, make_data_type):
y = y.map({unique_vals[i]: float(i) for i in range(len(unique_vals))})

y = y.astype(target_type)

if data_type == 'np':
X = X.to_numpy()
y = y.to_numpy()

elif data_type == 'ww':
X = ww.DataTable(X)
y = ww.DataColumn(y)
if data_type != 'pd':
X = make_data_type(data_type, X)
y = make_data_type(data_type, y)

automl = AutoMLSearch(X_train=X, y_train=y, problem_type=automl_type, max_iterations=3, n_jobs=1)
automl.search()
13 changes: 13 additions & 0 deletions evalml/tests/component_tests/test_components.py
@@ -825,6 +825,19 @@ def test_all_estimators_check_fit(X_y_binary, test_estimator_needs_fitting_false
component.feature_importance


@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww'])
def test_all_transformers_check_fit_input_type(data_type, X_y_binary, make_data_type):
X, y = X_y_binary
X = make_data_type(data_type, X)
y = make_data_type(data_type, y)
for component_class in _all_transformers():
if not component_class.needs_fitting:
continue

component = component_class()
component.fit(X, y)


def test_no_fitting_required_components(X_y_binary, test_estimator_needs_fitting_false, helper_functions):
X, y = X_y_binary
for component_class in all_components() + [test_estimator_needs_fitting_false]:
31 changes: 30 additions & 1 deletion evalml/tests/component_tests/test_estimators.py
@@ -2,10 +2,14 @@

import numpy as np
import pandas as pd
import pytest

from evalml.model_family import ModelFamily
from evalml.pipelines.components import Estimator
from evalml.pipelines.components.utils import _all_estimators_used_in_search
from evalml.pipelines.components.utils import (
_all_estimators_used_in_search,
get_estimators
)
from evalml.problem_types import ProblemTypes, handle_problem_types


@@ -56,3 +60,28 @@ class MockEstimator(Estimator):
mock_estimator.supported_problem_types = ['binary', 'multiclass']
assert mock_estimator != MockEstimator()
assert 'Mock Estimator' != mock_estimator


@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww'])
def test_all_estimators_check_fit_input_type(data_type, X_y_binary, make_data_type, helper_functions):
X, y = X_y_binary
X = make_data_type(data_type, X)
y = make_data_type(data_type, y)
estimators_to_check = [estimator for estimator in get_estimators('binary')]
for component_class in estimators_to_check:
component = helper_functions.safe_init_component_with_njobs_1(component_class)
component.fit(X, y)
component.predict(X)
component.predict_proba(X)


@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww'])
def test_all_estimators_check_fit_input_type_regression(data_type, X_y_regression, make_data_type, helper_functions):
X, y = X_y_regression
X = make_data_type(data_type, X)
y = make_data_type(data_type, y)
estimators_to_check = [estimator for estimator in get_estimators('regression')]
for component_class in estimators_to_check:
component = helper_functions.safe_init_component_with_njobs_1(component_class)
component.fit(X, y)
component.predict(X)
5 changes: 5 additions & 0 deletions evalml/tests/conftest.py
@@ -516,6 +516,11 @@ def safe_init_pipeline_with_njobs_1(pipeline_class):
def make_data_type():
"""Helper function to convert numpy or pandas input to the appropriate type for tests."""
def _make_data_type(data_type, data):
if data_type == "li":
if isinstance(data, pd.DataFrame):
data = data.to_numpy()
data = data.tolist()
return data
if data_type != "np":
if len(data.shape) == 1:
data = pd.Series(data)
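A self-contained version of the helper with indentation restored; the fixture body is truncated above, so the `np` and `ww` branches here are assumptions based on how the tests use it (woodwork 0.x `DataTable`/`DataColumn` API):

```python
import pandas as pd
import woodwork as ww  # assumes the woodwork 0.x DataTable/DataColumn API

def make_data_type(data_type, data):
    # 'li': lists are the new case this PR adds.
    if data_type == "li":
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        return data.tolist()
    if data_type == "np":
        # Assumed branch: pass numpy through, converting pandas if needed.
        return data.to_numpy() if isinstance(data, (pd.Series, pd.DataFrame)) else data
    # 'pd' and 'ww': wrap 1D data as a Series, 2D data as a DataFrame.
    data = pd.Series(data) if len(data.shape) == 1 else pd.DataFrame(data)
    if data_type == "ww":
        # Assumed branch, mirroring the ww conversions removed from the tests above.
        return ww.DataColumn(data) if isinstance(data, pd.Series) else ww.DataTable(data)
    return data
```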
20 changes: 19 additions & 1 deletion evalml/tests/objective_tests/test_fraud_detection.py
@@ -33,10 +33,13 @@ def test_fraud_objective_function_amount_col(X_y_binary):
fraud_payout_percentage=.75,
amount_col="this column does not exist")
y_predicted = pd.Series([.1, .5, .5])
y_true = pd.Series([True, False, True])
y_true = [True, False, True]
with pytest.raises(ValueError, match="`this column does not exist` is not a valid column in X."):
objective.objective_function(y_true, y_predicted, X)

with pytest.raises(ValueError, match="`this column does not exist` is not a valid column in X."):
objective.objective_function(y_true, y_predicted, X.tolist())


def test_input_contains_nan(X_y_binary):
fraud_cost = FraudCost(amount_col="value")
@@ -139,3 +142,18 @@ def test_fraud_objective_score(X_y_binary):
pd.testing.assert_series_equal(out, expected_y_pred, check_names=False)
score = fraud_cost.score(y_true, out, extra_columns)
assert (score == 0.255)


def test_fraud_objective_score_list(X_y_binary):
X, y = X_y_binary
fraud_cost = FraudCost(amount_col="value")

y_predicted = [.1, .5, .5]
y_true = [True, False, True]
extra_columns = pd.DataFrame({"value": [100, 5, 250]})

out = fraud_cost.decision_function(y_predicted, 5, extra_columns)
assert isinstance(out, pd.Series)
pd.testing.assert_series_equal(out, pd.Series(y_true), check_names=False)
score = fraud_cost.score(y_true, out, extra_columns)
assert (score == 0.0)
13 changes: 4 additions & 9 deletions evalml/tests/pipeline_tests/test_pipelines.py
@@ -1597,11 +1597,11 @@ def test_get_default_parameters(logistic_regression_binary_pipeline_class):
assert logistic_regression_binary_pipeline_class.default_parameters == expected_defaults


@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww'])
@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww'])
@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS])
@pytest.mark.parametrize("target_type", ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool', 'category', 'object', 'Int64', 'boolean'])
def test_targets_data_types_classification_pipelines(data_type, problem_type, target_type, all_binary_pipeline_classes,
all_multiclass_pipeline_classes, helper_functions):
make_data_type, all_multiclass_pipeline_classes, helper_functions):
if data_type == 'np' and target_type in ['Int64', 'boolean']:
pytest.skip("Skipping test where data type is numpy and target type is nullable dtype")

@@ -1633,13 +1633,8 @@ def test_targets_data_types_classification_pipelines(data_type, problem_type, target_type, all_binary_pipeline_classes, make_data_type, all_multiclass_pipeline_classes, helper_functions):
y = y.astype(target_type)
unique_vals = y.unique()

if data_type == 'np':
X = X.to_numpy()
y = y.to_numpy()

elif data_type == 'ww':
X = ww.DataTable(X)
y = ww.DataColumn(y)
X = make_data_type(data_type, X)
y = make_data_type(data_type, y)

for pipeline_class in pipeline_classes:
pipeline = helper_functions.safe_init_pipeline_with_njobs_1(pipeline_class)
2 changes: 1 addition & 1 deletion evalml/utils/gen_utils.py
@@ -219,7 +219,7 @@ def _rename_column_names_to_numeric(X):
Transformed X where column names are renamed to numerical values
"""
X_t = X
if isinstance(X, np.ndarray):
if isinstance(X, (np.ndarray, list)):
return pd.DataFrame(X)
if isinstance(X, ww.DataTable):
X_t = X.to_dataframe()
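With `list` added to the isinstance check, list input takes the same early return as numpy arrays; a quick check of what that produces:

```python
import pandas as pd

# pd.DataFrame over a list of rows assigns numeric (RangeIndex) column names,
# which is exactly the renaming this helper guarantees.
X = [[1, 2], [3, 4]]
print(pd.DataFrame(X).columns.tolist())  # [0, 1]
```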