Added basic one-hot encoding #73

Merged
merged 19 commits into from Sep 18, 2019
Changes from 8 commits
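
The recurring change in this PR is to prepend a category_encoders OneHotEncoder step to each scikit-learn Pipeline and expose its drop_invariant flag as a tunable hyperparameter. A minimal sketch of that pattern, using simplified names and a plain LogisticRegression rather than the actual evalml classes:

# Illustrative sketch (not part of the diff): encode categoricals first, then
# impute, scale, and fit the estimator, mirroring the step order added below.
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def make_pipeline(impute_strategy="mean", drop_invariant=False):
    return Pipeline([
        ("encoder", ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True)),
        ("imputer", SimpleImputer(strategy=impute_strategy)),
        ("scaler", StandardScaler()),
        ("estimator", LogisticRegression(penalty="l2", C=1.0)),
    ])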
5 changes: 0 additions & 5 deletions evalml/models/auto_base.py
@@ -5,7 +5,6 @@
import numpy as np
import pandas as pd
from colorama import Style
from pandas.api.types import is_numeric_dtype
from tqdm import tqdm

from evalml import preprocessing
@@ -97,10 +96,6 @@ def fit(self, X, y, feature_types=None, raise_errors=False):
if not isinstance(y, pd.Series):
y = pd.Series(y)

for col in X.columns:
if not is_numeric_dtype(X[col]):
raise ValueError("Input column '{}' contains non-numerical data".format(col))

self._log_title("Beginning pipeline search")
self._log("Optimizing for %s. " % self.objective.name, new_line=False)

8 changes: 6 additions & 2 deletions evalml/pipelines/classification/logistic_regression.py
@@ -1,3 +1,4 @@
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
@@ -20,11 +21,13 @@ class LogisticRegressionPipeline(PipelineBase):
"penalty": ["l2"],
"C": Real(.01, 10),
"impute_strategy": ["mean", "median", "most_frequent"],
"drop_invariant": [True, False]
}

def __init__(self, objective, penalty, C, impute_strategy,
def __init__(self, objective, penalty, C, impute_strategy, drop_invariant,
number_features, n_jobs=1, random_state=0):
imputer = SimpleImputer(strategy=impute_strategy)
enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True)

estimator = LogisticRegression(random_state=random_state,
penalty=penalty,
@@ -34,7 +37,8 @@ def __init__(self, objective, penalty, C, impute_strategy,
n_jobs=-1)

self.pipeline = Pipeline(
[("imputer", imputer),
[("encoder", enc),
("imputer", imputer),
("scaler", StandardScaler()),
("estimator", estimator)]
)
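
For context on the new "encoder" step, a toy example (assumed, not taken from this PR) of how category_encoders' OneHotEncoder behaves: by default it encodes the object/category columns, passes numeric columns through, and with drop_invariant=True it also drops any encoded column whose values never vary.

# Assumed toy data, for illustration only.
import category_encoders as ce
import pandas as pd

X = pd.DataFrame({
    "fare": [7.25, 71.28, 8.05],     # numeric: passed through unchanged
    "embarked": ["S", "C", "S"],     # categorical: expanded to indicator columns
})

enc = ce.OneHotEncoder(drop_invariant=False, return_df=True)
print(enc.fit_transform(X))
# With the default use_cat_names=False the new columns are numbered
# (e.g. embarked_1, embarked_2) rather than named after the category values.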
12 changes: 8 additions & 4 deletions evalml/pipelines/classification/random_forest.py
@@ -1,3 +1,4 @@
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
@@ -20,12 +21,14 @@ class RFClassificationPipeline(PipelineBase):
"n_estimators": Integer(10, 1000),
"max_depth": Integer(1, 32),
"impute_strategy": ["mean", "median", "most_frequent"],
"percent_features": Real(.01, 1)
"percent_features": Real(.01, 1),
"drop_invariant": [True, False]
}

def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_features,
number_features, n_jobs=1, random_state=0):
def __init__(self, objective, n_estimators, max_depth, impute_strategy, drop_invariant,
percent_features, number_features, n_jobs=1, random_state=0):
imputer = SimpleImputer(strategy=impute_strategy)
enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True)

estimator = RandomForestClassifier(random_state=random_state,
n_estimators=n_estimators,
@@ -39,7 +42,8 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_
)

self.pipeline = Pipeline(
[("imputer", imputer),
[("encoder", enc),
("imputer", imputer),
("feature_selection", feature_selection),
("estimator", estimator)]
)
10 changes: 7 additions & 3 deletions evalml/pipelines/classification/xgboost.py
@@ -1,3 +1,4 @@
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel
@@ -21,12 +22,14 @@ class XGBoostPipeline(PipelineBase):
"min_child_weight": Real(1, 10),
"max_depth": Integer(1, 20),
"impute_strategy": ["mean", "median", "most_frequent"],
"drop_invariant": [True, False],
"percent_features": Real(.01, 1)
}

def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, percent_features,
number_features, n_jobs=1, random_state=0):
def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy, drop_invariant,
percent_features, number_features, n_jobs=1, random_state=0):
imputer = SimpleImputer(strategy=impute_strategy)
enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True)

estimator = XGBClassifier(
random_state=random_state,
@@ -42,7 +45,8 @@ def __init__(self, objective, eta, min_child_weight, max_depth, impute_strategy,
)

self.pipeline = Pipeline(
[("imputer", imputer),
[("encoder", enc),
("imputer", imputer),
("feature_selection", feature_selection),
("estimator", estimator)]
)
10 changes: 7 additions & 3 deletions evalml/pipelines/regression/random_forest.py
@@ -1,3 +1,4 @@
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
@@ -20,13 +21,15 @@ class RFRegressionPipeline(PipelineBase):
"n_estimators": Integer(10, 1000),
"max_depth": Integer(1, 32),
"impute_strategy": ["mean", "median", "most_frequent"],
"drop_invariant": [True, False],
"percent_features": Real(.01, 1)
}

def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_features,
def __init__(self, objective, n_estimators, max_depth, impute_strategy, drop_invariant, percent_features,
number_features, n_jobs=1, random_state=0):

imputer = SimpleImputer(strategy=impute_strategy)
enc = ce.OneHotEncoder(drop_invariant=drop_invariant, return_df=True)

estimator = RandomForestRegressor(random_state=random_state,
n_estimators=n_estimators,
@@ -40,9 +43,10 @@ def __init__(self, objective, n_estimators, max_depth, impute_strategy, percent_
)

self.pipeline = Pipeline(
[("imputer", imputer),
[("encoder", enc),
("imputer", imputer),
("feature_selection", feature_selection),
("estimator", estimator)]
("estimator", estimator)],
)

super().__init__(objective=objective, random_state=random_state)
8 changes: 8 additions & 0 deletions evalml/tests/conftest.py
@@ -28,6 +28,14 @@ def X_y_categorical_regression():
return X, y


@pytest.fixture
def X_y_categorical_classification():
titanic = pd.read_csv('https://featuretools-static.s3.amazonaws.com/evalml/Titanic/train.csv')
y = titanic['Survived']
X = titanic.drop('Survived', axis=1)
return X, y


@pytest.fixture
def trained_model(X_y):
X, y = X_y
7 changes: 7 additions & 0 deletions evalml/tests/test_autoclassifier.py
@@ -119,6 +119,13 @@ def test_multi_auto(X_y_multi):
assert clf.default_objectives == get_objectives('multiclass')


def test_categorical_auto(X_y_categorical_classification):
X, y = X_y_categorical_classification
clf = AutoClassifier(objective="recall", max_pipelines=5, multiclass=False)
clf.fit(X.values, y)
assert not clf.rankings['score'].isnull().all()


def test_random_state(X_y):
X, y = X_y

5 changes: 2 additions & 3 deletions evalml/tests/test_autoregressor.py
@@ -54,9 +54,8 @@ def test_random_state(X_y):
def test_categorical(X_y_categorical_regression):
X, y = X_y_categorical_regression
clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0)
error_msg = 'contains non-numerical data'
with pytest.raises(ValueError, match=error_msg):
clf.fit(X, y, raise_errors=True)
clf.fit(X.values, y)
assert not clf.rankings['score'].isnull().all()


def test_callback(X_y):
2 changes: 1 addition & 1 deletion evalml/tests/test_logistic_regression.py
@@ -7,7 +7,7 @@
def test_lr_multi(X_y_multi):
X, y = X_y_multi
objective = PrecisionMicro()
clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]))
clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0]))
clf.fit(X, y)
clf.score(X, y)
y_pred = clf.predict(X)
2 changes: 1 addition & 1 deletion evalml/tests/test_objectives.py
@@ -26,7 +26,7 @@ def test_binary_average(X_y):
X = pd.DataFrame(X)
y = pd.Series(y)

pipeline = LogisticRegressionPipeline(objective=Precision(), penalty='l2', C=1.0, impute_strategy='mean', number_features=0)
pipeline = LogisticRegressionPipeline(objective=Precision(), penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=0)
pipeline.fit(X, y)
y_pred = pipeline.predict(X)

6 changes: 3 additions & 3 deletions evalml/tests/test_pipelines.py
@@ -43,7 +43,7 @@ def test_serialization(X_y, trained_model, path_management):
path = os.path.join(path_management, 'pipe.pkl')
objective = Precision()

pipeline = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]))
pipeline = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0]))
pipeline.fit(X, y)
save_pipeline(pipeline, path)
assert pipeline.score(X, y) == load_pipeline(path).score(X, y)
@@ -60,10 +60,10 @@ def test_reproducibility(X_y):
amount_col=10
)

clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]), random_state=0)
clf = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0]), random_state=0)
clf.fit(X, y)

clf_1 = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]), random_state=0)
clf_1 = LogisticRegressionPipeline(objective=objective, penalty='l2', C=1.0, impute_strategy='mean', drop_invariant=False, number_features=len(X[0]), random_state=0)
clf_1.fit(X, y)

assert clf_1.score(X, y) == clf.score(X, y)
2 changes: 1 addition & 1 deletion evalml/tests/test_rf.py
@@ -7,7 +7,7 @@
def test_rf_multi(X_y_multi):
X, y = X_y_multi
objective = PrecisionMicro()
clf = RFClassificationPipeline(objective=objective, n_estimators=10, max_depth=3, impute_strategy='mean', percent_features=1.0, number_features=len(X[0]))
clf = RFClassificationPipeline(objective=objective, n_estimators=10, max_depth=3, impute_strategy='mean', drop_invariant=False, percent_features=1.0, number_features=len(X[0]))
clf.fit(X, y)
clf.score(X, y)
y_pred = clf.predict(X)
2 changes: 1 addition & 1 deletion evalml/tests/test_xgboost.py
@@ -7,7 +7,7 @@
def test_xg_multi(X_y_multi):
X, y = X_y_multi
objective = PrecisionMicro()
clf = XGBoostPipeline(objective=objective, eta=0.1, min_child_weight=1, max_depth=3, impute_strategy='mean', percent_features=1.0, number_features=len(X[0]))
clf = XGBoostPipeline(objective=objective, eta=0.1, min_child_weight=1, max_depth=3, impute_strategy='mean', drop_invariant=False, percent_features=1.0, number_features=len(X[0]))
clf.fit(X, y)
clf.score(X, y)
y_pred = clf.predict(X)
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,3 +9,4 @@ scikit-optimize[plots]
colorama
s3fs==0.2.2
joblib>=0.10.3
category_encoders