diff --git a/docs/source/automl/pipeline_search.ipynb b/docs/source/automl/pipeline_search.ipynb index e1b0e6e79d..845902ec96 100644 --- a/docs/source/automl/pipeline_search.ipynb +++ b/docs/source/automl/pipeline_search.ipynb @@ -20,7 +20,7 @@ "source": [ "## How it works\n", "\n", - "EvalML selects and tunes machine learning pipelines built of numerous steps. This includes missing value imputation, feature selection, feature scaling, and finally machine learning. As EvalML tunes pipelines, it uses the objective function selected and configured by the user to guide its search. \n", + "EvalML selects and tunes machine learning pipelines built of numerous steps. This includes encoding categorical data, missing value imputation, feature selection, feature scaling, and finally machine learning. As EvalML tunes pipelines, it uses the objective function selected and configured by the user to guide its search. \n", "\n", "\n", "At each iteration, EvalML uses cross-validation to generate an estimate of the pipeline's performances. If a pipeline has high variance across cross-validation folds, it will provide a warning. In this case, the pipeline may not perform reliably in the future.\n", diff --git a/docs/source/automl/regression_example.ipynb b/docs/source/automl/regression_example.ipynb index afe3fb1ebe..9c47f21cfc 100644 --- a/docs/source/automl/regression_example.ipynb +++ b/docs/source/automl/regression_example.ipynb @@ -245,7 +245,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/docs/source/automl/search_results.ipynb b/docs/source/automl/search_results.ipynb index 407f7cdec3..9ccc9b627b 100644 --- a/docs/source/automl/search_results.ipynb +++ b/docs/source/automl/search_results.ipynb @@ -811,7 +811,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/docs/source/demos/fraud.ipynb b/docs/source/demos/fraud.ipynb index 4332065931..28fd8517e7 100644 --- a/docs/source/demos/fraud.ipynb +++ b/docs/source/demos/fraud.ipynb @@ -90,16 +90,38 @@ "X, y = evalml.demos.load_fraud()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "EvalML natively supports one-hot encoding. Here we keep 1 out of the 6 categorical columns to decrease computation time." + ] + }, { "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "card_id int64\n", + "store_id int64\n", + "amount int64\n", + "currency object\n", + "customer_present bool\n", + "lat float64\n", + "lng float64\n", + "dtype: object\n" + ] + } + ], "source": [ - "# select numeric data before running AutoClassifer\n", - "numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']\n", - "X = X.select_dtypes(include=numerics)\n", - "X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(X, y, test_size=.8, random_state=0)" + "X = X.drop(['datetime', 'expiration_date', 'country', 'region', 'provider'], axis=1)\n", + "X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(X, y, test_size=0.2, random_state=0)\n", + "\n", + "print(X.dtypes)" ] }, { @@ -124,11 +146,11 @@ "\n", "Optimizing for Fraud Cost. Lower score is better.\n", "\n", - "Searching up to 10 pipelines. No time limit is set. Set one using max_time parameter.\n", + "Searching up to 5 pipelines. 
No time limit is set. Set one using max_time parameter.\n", "\n", "Possible model types: xgboost, linear_model, random_forest\n", "\n", - "Testing XGBoost w/ imputation: 100%|██████████| 10/10 [01:05<00:00, 6.53s/it] \n", + "Testing LogisticRegression w/ imputation + scaling: 100%|██████████| 5/5 [18:10<00:00, 218.05s/it]\n", "\n", "✔ Optimization finished\n" ] @@ -136,7 +158,8 @@ ], "source": [ "clf = evalml.AutoClassifier(objective=fraud_objective,\n", - " max_pipelines=10)\n", + " additional_objectives=['auc', 'recall', 'precision'],\n", + " max_pipelines=5)\n", "\n", "clf.fit(X_train, y_train)" ] @@ -185,43 +208,63 @@ " \n", " \n", " \n", - " 0\n", - " 9\n", + " 0\n", + " 1\n", " XGBoostPipeline\n", - " 0.007614\n", + " 0.007623\n", + " False\n", + " {'eta': 0.38438170729269994, 'min_child_weight...\n", + " \n", + " \n", + " 1\n", + " 4\n", + " LogisticRegressionPipeline\n", + " 0.007623\n", " False\n", - " {'eta': 0.6481718720511973, 'min_child_weight'...\n", + " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", " \n", " \n", - " 1\n", " 2\n", + " 0\n", " XGBoostPipeline\n", - " 0.007614\n", + " 0.007623\n", " False\n", " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", " \n", - " 2\n", - " 8\n", + " 3\n", + " 3\n", + " XGBoostPipeline\n", + " 0.007623\n", + " False\n", + " {'eta': 0.5288949197529046, 'min_child_weight'...\n", + " \n", + " \n", + " 4\n", + " 2\n", " RFClassificationPipeline\n", - " 0.007614\n", + " 0.007623\n", " False\n", - " {'n_estimators': 369, 'max_depth': 10, 'impute...\n", + " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id pipeline_name score high_variance_cv \\\n", - "0 9 XGBoostPipeline 0.007614 False \n", - "1 2 XGBoostPipeline 0.007614 False \n", - "2 8 RFClassificationPipeline 0.007614 False \n", + " id pipeline_name score high_variance_cv \\\n", + "0 1 XGBoostPipeline 0.007623 False \n", + "1 4 LogisticRegressionPipeline 0.007623 False \n", + "2 0 XGBoostPipeline 0.007623 False \n", + "3 3 XGBoostPipeline 0.007623 False \n", + "4 2 RFClassificationPipeline 0.007623 False \n", "\n", " parameters \n", - "0 {'eta': 0.6481718720511973, 'min_child_weight'... \n", - "1 {'eta': 0.5928446182250184, 'min_child_weight'... \n", - "2 {'n_estimators': 369, 'max_depth': 10, 'impute... " + "0 {'eta': 0.38438170729269994, 'min_child_weight... \n", + "1 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", + "2 {'eta': 0.5928446182250184, 'min_child_weight'... \n", + "3 {'eta': 0.5288949197529046, 'min_child_weight'... \n", + "4 {'n_estimators': 569, 'max_depth': 22, 'impute... 
" ] }, "execution_count": 6, @@ -230,7 +273,7 @@ } ], "source": [ - "clf.rankings.head(3)" + "clf.rankings" ] }, { @@ -274,25 +317,25 @@ "Pipeline Name: XGBoost w/ imputation\n", "Model type: xgboost\n", "Objective: Fraud Cost (lower is better)\n", - "Total training time (including CV): 6.1 seconds\n", + "Total training time (including CV): 383.7 seconds\n", "\n", "Parameters\n", "==========\n", - "• eta: 0.6481718720511973\n", - "• min_child_weight: 4.314173858564932\n", - "• max_depth: 6\n", - "• impute_strategy: most_frequent\n", - "• percent_features: 0.871312026764351\n", + "• eta: 0.38438170729269994\n", + "• min_child_weight: 3.677811458900251\n", + "• max_depth: 13\n", + "• impute_strategy: median\n", + "• percent_features: 0.793807787701838\n", "\n", "Cross Validation\n", "=================\n", - " F1 Precision Recall AUC Log Loss Fraud Cost # Training # Testing\n", - "0 0.264 0.152 0.264 0.841 0.192 0.008 13332.000 6666.000\n", - "1 0.264 0.152 0.264 0.845 0.191 0.008 13332.000 6666.000\n", - "2 0.264 0.152 0.264 0.834 0.202 0.008 13332.000 6666.000\n", - "mean 0.264 0.152 0.264 0.840 0.195 0.008 - -\n", - "std 0.000 0.000 0.000 0.006 0.006 0.000 - -\n", - "coef of var 0.000 0.000 0.000 0.007 0.029 0.002 - -\n" + " AUC Recall Precision Fraud Cost # Training # Testing\n", + "0 0.831 0.264 0.152 0.008 53328.000 26665.000\n", + "1 0.833 0.264 0.152 0.008 53328.000 26665.000\n", + "2 0.836 0.264 0.152 0.008 53330.000 26663.000\n", + "mean 0.834 0.264 0.152 0.008 - -\n", + "std 0.003 0.000 0.000 0.000 - -\n", + "coef of var 0.003 0.000 0.000 0.003 - -\n" ] } ], @@ -317,7 +360,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 9, @@ -344,8 +387,8 @@ { "data": { "text/plain": [ - "(0.007625590107402798,\n", - " {'AUC': 0.8497570761248703, 'Fraud Cost': 0.007625590107402798})" + "(0.007626457064581641,\n", + " {'AUC': 0.8336438887334185, 'Fraud Cost': 0.007626457064581641})" ] }, "execution_count": 10, @@ -381,11 +424,11 @@ "\n", "Optimizing for AUC. Greater score is better.\n", "\n", - "Searching up to 10 pipelines. No time limit is set. Set one using max_time parameter.\n", + "Searching up to 5 pipelines. No time limit is set. 
Set one using max_time parameter.\n", "\n", "Possible model types: xgboost, linear_model, random_forest\n", "\n", - "Testing XGBoost w/ imputation: 100%|██████████| 10/10 [01:29<00:00, 8.96s/it] \n", + "Testing LogisticRegression w/ imputation + scaling: 100%|██████████| 5/5 [19:16<00:00, 231.26s/it]\n", "\n", "✔ Optimization finished\n" ] @@ -393,7 +436,8 @@ ], "source": [ "clf_auc = evalml.AutoClassifier(objective='auc',\n", - " max_pipelines=10)\n", + " additional_objectives=['recall', 'precision'],\n", + " max_pipelines=5)\n", "\n", "clf_auc.fit(X_train, y_train)" ] @@ -440,113 +484,63 @@ " \n", " \n", " \n", - " 0\n", - " 1\n", + " 0\n", + " 2\n", " RFClassificationPipeline\n", - " 0.863238\n", + " 0.873053\n", " False\n", " {'n_estimators': 569, 'max_depth': 22, 'impute...\n", " \n", " \n", - " 1\n", - " 9\n", + " 1\n", + " 0\n", " XGBoostPipeline\n", - " 0.852239\n", + " 0.849826\n", " False\n", - " {'eta': 0.38438170729269994, 'min_child_weight...\n", - " \n", - " \n", - " 2\n", - " 3\n", - " RFClassificationPipeline\n", - " 0.847514\n", - " False\n", - " {'n_estimators': 369, 'max_depth': 10, 'impute...\n", - " \n", - " \n", - " 3\n", - " 8\n", - " RFClassificationPipeline\n", - " 0.846346\n", - " False\n", - " {'n_estimators': 715, 'max_depth': 7, 'impute_...\n", + " {'eta': 0.5928446182250184, 'min_child_weight'...\n", " \n", " \n", - " 4\n", " 2\n", + " 1\n", " XGBoostPipeline\n", - " 0.845902\n", + " 0.840634\n", " False\n", - " {'eta': 0.5928446182250184, 'min_child_weight'...\n", + " {'eta': 0.38438170729269994, 'min_child_weight...\n", " \n", " \n", - " 5\n", - " 5\n", - " RFClassificationPipeline\n", - " 0.842745\n", + " 3\n", + " 3\n", + " XGBoostPipeline\n", + " 0.839091\n", " False\n", - " {'n_estimators': 609, 'max_depth': 7, 'impute_...\n", + " {'eta': 0.5288949197529046, 'min_child_weight'...\n", " \n", " \n", - " 6\n", + " 4\n", " 4\n", " LogisticRegressionPipeline\n", - " 0.838806\n", - " False\n", - " {'penalty': 'l2', 'C': 6.239401330891865, 'imp...\n", - " \n", - " \n", - " 7\n", - " 0\n", - " LogisticRegressionPipeline\n", - " 0.838806\n", + " 0.831181\n", " False\n", " {'penalty': 'l2', 'C': 8.444214828324364, 'imp...\n", " \n", - " \n", - " 8\n", - " 7\n", - " LogisticRegressionPipeline\n", - " 0.838806\n", - " False\n", - " {'penalty': 'l2', 'C': 8.123565600467177, 'imp...\n", - " \n", - " \n", - " 9\n", - " 6\n", - " LogisticRegressionPipeline\n", - " 0.838806\n", - " False\n", - " {'penalty': 'l2', 'C': 0.5765626434012575, 'im...\n", - " \n", " \n", "\n", "" ], "text/plain": [ " id pipeline_name score high_variance_cv \\\n", - "0 1 RFClassificationPipeline 0.863238 False \n", - "1 9 XGBoostPipeline 0.852239 False \n", - "2 3 RFClassificationPipeline 0.847514 False \n", - "3 8 RFClassificationPipeline 0.846346 False \n", - "4 2 XGBoostPipeline 0.845902 False \n", - "5 5 RFClassificationPipeline 0.842745 False \n", - "6 4 LogisticRegressionPipeline 0.838806 False \n", - "7 0 LogisticRegressionPipeline 0.838806 False \n", - "8 7 LogisticRegressionPipeline 0.838806 False \n", - "9 6 LogisticRegressionPipeline 0.838806 False \n", + "0 2 RFClassificationPipeline 0.873053 False \n", + "1 0 XGBoostPipeline 0.849826 False \n", + "2 1 XGBoostPipeline 0.840634 False \n", + "3 3 XGBoostPipeline 0.839091 False \n", + "4 4 LogisticRegressionPipeline 0.831181 False \n", "\n", " parameters \n", "0 {'n_estimators': 569, 'max_depth': 22, 'impute... \n", - "1 {'eta': 0.38438170729269994, 'min_child_weight... \n", - "2 {'n_estimators': 369, 'max_depth': 10, 'impute... 
\n", - "3 {'n_estimators': 715, 'max_depth': 7, 'impute_... \n", - "4 {'eta': 0.5928446182250184, 'min_child_weight'... \n", - "5 {'n_estimators': 609, 'max_depth': 7, 'impute_... \n", - "6 {'penalty': 'l2', 'C': 6.239401330891865, 'imp... \n", - "7 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... \n", - "8 {'penalty': 'l2', 'C': 8.123565600467177, 'imp... \n", - "9 {'penalty': 'l2', 'C': 0.5765626434012575, 'im... " + "1 {'eta': 0.5928446182250184, 'min_child_weight'... \n", + "2 {'eta': 0.38438170729269994, 'min_child_weight... \n", + "3 {'eta': 0.5288949197529046, 'min_child_weight'... \n", + "4 {'penalty': 'l2', 'C': 8.444214828324364, 'imp... " ] }, "execution_count": 12, @@ -566,7 +560,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 13, @@ -589,8 +583,8 @@ { "data": { "text/plain": [ - "(0.8619958322554153,\n", - " {'AUC': 0.8619958322554153, 'Fraud Cost': 0.03432590219090485})" + "(0.8745605699827037,\n", + " {'AUC': 0.8745605699827037, 'Fraud Cost': 0.03273490785793763})" ] }, "execution_count": 14, @@ -629,7 +623,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/docs/source/roadmap.rst b/docs/source/roadmap.rst index 69bd353677..29c6e48889 100644 --- a/docs/source/roadmap.rst +++ b/docs/source/roadmap.rst @@ -11,5 +11,4 @@ There are numerous new features and functionality planned for EvalML, some of wh * Ability to warm start from a previous pipeline search * Instructions for adding your own modeling pipelines for EvalML to tune * Add additional hyperparameter tuning methods -* Handle categorical data natively within EvalML * Visualizations for understanding model search diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py index 44dc586303..dc5523ebd8 100644 --- a/evalml/models/auto_base.py +++ b/evalml/models/auto_base.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd from colorama import Style -from pandas.api.types import is_numeric_dtype from tqdm import tqdm from evalml import preprocessing @@ -101,10 +100,6 @@ def fit(self, X, y, feature_types=None, raise_errors=False): if not isinstance(y, pd.Series): y = pd.Series(y) - for col in X.columns: - if not is_numeric_dtype(X[col]): - raise ValueError("Input column '{}' contains non-numerical data".format(col)) - self._log_title("Beginning pipeline search") self._log("Optimizing for %s. 
" % self.objective.name, new_line=False) @@ -169,7 +164,6 @@ def _do_iteration(self, X, y, pbar, raise_errors): X_train, X_test = X.iloc[train], X.iloc[test] else: X_train, X_test = X[train], X[test] - if isinstance(y, pd.Series): y_train, y_test = y.iloc[train], y.iloc[test] else: diff --git a/evalml/pipelines/classification/logistic_regression.py b/evalml/pipelines/classification/logistic_regression.py index e064a33030..d77484fbb5 100644 --- a/evalml/pipelines/classification/logistic_regression.py +++ b/evalml/pipelines/classification/logistic_regression.py @@ -1,3 +1,4 @@ +import category_encoders as ce import numpy as np import pandas as pd from sklearn.impute import SimpleImputer @@ -25,6 +26,7 @@ class LogisticRegressionPipeline(PipelineBase): def __init__(self, objective, penalty, C, impute_strategy, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) + enc = ce.OneHotEncoder(use_cat_names=True, return_df=True) estimator = LogisticRegression(random_state=random_state, penalty=penalty, @@ -34,7 +36,8 @@ def __init__(self, objective, penalty, C, impute_strategy, n_jobs=-1) self.pipeline = Pipeline( - [("imputer", imputer), + [("encoder", enc), + ("imputer", imputer), ("scaler", StandardScaler()), ("estimator", estimator)] ) diff --git a/evalml/pipelines/classification/random_forest.py b/evalml/pipelines/classification/random_forest.py index 06ea8cb1b1..4f56eb13a2 100644 --- a/evalml/pipelines/classification/random_forest.py +++ b/evalml/pipelines/classification/random_forest.py @@ -1,3 +1,4 @@ +import category_encoders as ce import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier @@ -23,9 +24,10 @@ class RFClassificationPipeline(PipelineBase): "percent_features": Real(.01, 1) } - def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_features, - number_features, n_jobs=1, random_state=0): + def __init__(self, objective, n_estimators, max_depth, impute_strategy, + percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) + enc = ce.OneHotEncoder(use_cat_names=True, return_df=True) estimator = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators, @@ -39,7 +41,8 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_ ) self.pipeline = Pipeline( - [("imputer", imputer), + [("encoder", enc), + ("imputer", imputer), ("feature_selection", feature_selection), ("estimator", estimator)] ) diff --git a/evalml/pipelines/classification/xgboost.py b/evalml/pipelines/classification/xgboost.py index ae2bc63afa..242fd0ea73 100644 --- a/evalml/pipelines/classification/xgboost.py +++ b/evalml/pipelines/classification/xgboost.py @@ -1,3 +1,4 @@ +import category_encoders as ce import numpy as np import pandas as pd from sklearn.feature_selection import SelectFromModel @@ -24,9 +25,10 @@ class XGBoostPipeline(PipelineBase): "percent_features": Real(.01, 1) } - def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, percent_features, - number_features, n_jobs=1, random_state=0): + def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, + percent_features, number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) + enc = ce.OneHotEncoder(use_cat_names=True, return_df=True) estimator = XGBClassifier( random_state=random_state, @@ -42,7 +44,8 @@ def __init__(self, objective, eta, min_child_weight, max_depth, 
impute_strategy, ) self.pipeline = Pipeline( - [("imputer", imputer), + [("encoder", enc), + ("imputer", imputer), ("feature_selection", feature_selection), ("estimator", estimator)] ) diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index d2dac014be..2170a04e26 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -31,11 +31,11 @@ def fit(self, X, y, objective_fit_size=.2): if not isinstance(y, pd.Series): y = pd.Series(y) - self.input_feature_names = X.columns.tolist() if self.objective.needs_fitting: X, X_objective, y, y_objective = train_test_split(X, y, test_size=objective_fit_size, random_state=self.random_state) self.pipeline.fit(X, y) + self.input_feature_names = self.pipeline['encoder'].feature_names if self.objective.needs_fitting: if self.objective.fit_needs_proba: diff --git a/evalml/pipelines/regression/random_forest.py b/evalml/pipelines/regression/random_forest.py index ad93a8756c..4a46c3ac1f 100644 --- a/evalml/pipelines/regression/random_forest.py +++ b/evalml/pipelines/regression/random_forest.py @@ -1,3 +1,4 @@ +import category_encoders as ce import numpy as np import pandas as pd from sklearn.ensemble import RandomForestRegressor @@ -27,6 +28,7 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_ number_features, n_jobs=1, random_state=0): imputer = SimpleImputer(strategy=impute_strategy) + enc = ce.OneHotEncoder(use_cat_names=True, return_df=True) estimator = RandomForestRegressor(random_state=random_state, n_estimators=n_estimators, @@ -40,9 +42,10 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_ ) self.pipeline = Pipeline( - [("imputer", imputer), + [("encoder", enc), + ("imputer", imputer), ("feature_selection", feature_selection), - ("estimator", estimator)] + ("estimator", estimator)], ) super().__init__(objective=objective, random_state=random_state) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 84eacec411..29740382c0 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -25,6 +25,17 @@ def X_y_categorical_regression(): flights = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv') y = flights['tip'] X = flights.drop('tip', axis=1) + + # add categorical dtype + X['smoker'] = X['smoker'].astype('category') + return X, y + + +@pytest.fixture +def X_y_categorical_classification(): + titanic = pd.read_csv('https://featuretools-static.s3.amazonaws.com/evalml/Titanic/train.csv') + y = titanic['Survived'] + X = titanic.drop('Survived', axis=1) return X, y diff --git a/evalml/tests/test_autoclassifier.py b/evalml/tests/test_autoclassifier.py index be95e58500..8e3db40b43 100644 --- a/evalml/tests/test_autoclassifier.py +++ b/evalml/tests/test_autoclassifier.py @@ -119,6 +119,14 @@ def test_multi_auto(X_y_multi): assert clf.additional_objectives == get_objectives('multiclass') +def test_categorical_classification(X_y_categorical_classification): + X, y = X_y_categorical_classification + clf = AutoClassifier(objective="recall", max_pipelines=5, multiclass=False) + clf.fit(X, y) + assert not clf.rankings['score'].isnull().all() + assert not clf.get_pipeline(0).feature_importances.isnull().all().all() + + def test_random_state(X_y): X, y = X_y diff --git a/evalml/tests/test_autoregressor.py b/evalml/tests/test_autoregressor.py index c1d86b5d75..738a2f8db8 100644 --- a/evalml/tests/test_autoregressor.py +++ b/evalml/tests/test_autoregressor.py @@ -51,12 +51,12 @@ def 
test_random_state(X_y): assert pd.testing.assert_frame_equal(clf.rankings, clf_1.rankings) is None -def test_categorical(X_y_categorical_regression): +def test_categorical_regression(X_y_categorical_regression): X, y = X_y_categorical_regression clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0) - error_msg = 'contains non-numerical data' - with pytest.raises(ValueError, match=error_msg): - clf.fit(X, y, raise_errors=True) + clf.fit(X, y) + assert not clf.rankings['score'].isnull().all() + assert not clf.get_pipeline(0).feature_importances.isnull().all().all() def test_callback(X_y): diff --git a/requirements.txt b/requirements.txt index cf43ddcd84..e15bb34e8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ scikit-optimize[plots] colorama s3fs==0.2.2 joblib>=0.10.3 +category_encoders>=2.0.0
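
The pipeline changes above all follow the same pattern: a category_encoders OneHotEncoder is prepended as the first step of each scikit-learn Pipeline, so categorical columns are expanded before imputation, scaling/feature selection, and the estimator, and pipeline_base.py now reads the expanded column names back from the fitted encoder. The sketch below is a minimal standalone illustration of that step order, not code from this PR: the toy DataFrame and column names are made up for the example, and it assumes category_encoders>=2.0.0 (per the requirements.txt change) plus a scikit-learn version that supports indexing a Pipeline by step name, as self.pipeline['encoder'].feature_names in pipeline_base.py does.

    import category_encoders as ce
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline

    # Toy data (illustrative only): one categorical column and one numeric column.
    X = pd.DataFrame({"currency": ["USD", "EUR", "USD", "GBP"],
                      "amount": [24.0, 133.0, 52.0, 8.0]})
    y = pd.Series([0, 1, 0, 1])

    # Same step order this diff introduces: encode categoricals first,
    # then impute, then fit the estimator.
    pipeline = Pipeline([
        ("encoder", ce.OneHotEncoder(use_cat_names=True, return_df=True)),
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("estimator", RandomForestClassifier(n_estimators=10, random_state=0)),
    ])
    pipeline.fit(X, y)

    # After fitting, the encoder exposes the expanded column names --
    # the attribute pipeline_base.py now records as input_feature_names.
    print(pipeline["encoder"].feature_names)
    # -> one-hot columns such as 'currency_USD' plus the untouched numeric 'amount'

The same ordering applies to the XGBoost and random forest pipelines, which insert the encoder ahead of the existing SelectFromModel feature-selection step rather than a scaler.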