Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for unlim pipelines with max_time limit #70

Merged
merged 24 commits into from
Nov 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
377c5d9
Added support for unlim pipelines with max_time limit
christopherbunn Sep 13, 2019
27c69fc
Fixed lint errors
christopherbunn Sep 13, 2019
8265ebf
Increased number of test_binary_auto pipelines to 5
christopherbunn Sep 13, 2019
cde7f15
Fixed max_pipelines=None behavior and removed extraneous comment
christopherbunn Sep 13, 2019
d35a4a2
Reverted some AutoClassifier tests to use max_pipelines=5
christopherbunn Sep 16, 2019
540e4fa
Merge branch 'unlim-pipelines' of github.com:FeatureLabs/evalml into …
christopherbunn Oct 8, 2019
4c8a7a6
Merge branch 'master' of github.com:FeatureLabs/evalml into unlim-pip…
christopherbunn Oct 14, 2019
c39b2d6
Changed the format of the progress logs for max_time
christopherbunn Oct 14, 2019
ed89083
Merge branch 'master' into unlim-pipelines
christopherbunn Oct 17, 2019
ebf5f50
Merge branch 'master' of github.com:FeatureLabs/evalml into unlim-pip…
christopherbunn Oct 22, 2019
dc8e8f3
Changed to new pbar format and modified error msg
christopherbunn Oct 22, 2019
9a25853
Updated notebook example to include search limit
christopherbunn Oct 23, 2019
747d905
Updated limit handling to allow for no time parameters
christopherbunn Oct 23, 2019
7373b65
Fixed lint errors
christopherbunn Oct 23, 2019
1668b05
Merge branch 'master' of github.com:FeatureLabs/evalml into unlim-pip…
christopherbunn Oct 28, 2019
234cecf
Updated changelog
christopherbunn Oct 28, 2019
06e51c1
Merge branch 'master' of github.com:FeatureLabs/evalml into unlim-pip…
christopherbunn Oct 29, 2019
76cae9e
Closed pbar on early termination and removed new_line
christopherbunn Oct 29, 2019
eb7f6f2
Status bar changes
christopherbunn Oct 31, 2019
1b29167
Merge branch 'master' into unlim-pipelines
christopherbunn Oct 31, 2019
e98dd26
Merge branch 'master' into unlim-pipelines
christopherbunn Oct 31, 2019
a397993
Fixed lint error
christopherbunn Oct 31, 2019
c17334e
Merge branch 'unlim-pipelines' of github.com:FeatureLabs/evalml into …
christopherbunn Oct 31, 2019
7dbc123
Updated test and removed elapsed variable
christopherbunn Oct 31, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ licenses/
__pycache__/
*.py[cod]
*$py.class
**/.DS_Store
.DS_Store

# C extensions
*.so
Expand Down
20 changes: 10 additions & 10 deletions docs/source/automl/regression_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@
"\n",
"Possible model types: random_forest\n",
"\n",
"✔ Random Forest w/ imputation: 0%| | Elapsed:00:06\n",
"✔ Random Forest w/ imputation: 20%|██ | Elapsed:00:11\n",
"✔ Random Forest w/ imputation: 40%|████ | Elapsed:00:18\n",
"✔ Random Forest w/ imputation: 60%|██████ | Elapsed:00:24\n",
"✔ Random Forest w/ imputation: 80%|████████ | Elapsed:00:31\n",
"✔ Random Forest w/ imputation: 100%|██████████| Elapsed:00:31\n",
"✔ Random Forest w/ imputation: 0%| | Elapsed:00:05\n",
"✔ Random Forest w/ imputation: 20%|██ | Elapsed:00:10\n",
"✔ Random Forest w/ imputation: 40%|████ | Elapsed:00:16\n",
"✔ Random Forest w/ imputation: 60%|██████ | Elapsed:00:22\n",
"✔ Random Forest w/ imputation: 80%|████████ | Elapsed:00:30\n",
"✔ Random Forest w/ imputation: 100%|██████████| Elapsed:00:30\n",
"\n",
"✔ Optimization finished\n"
]
Expand All @@ -45,7 +45,7 @@
"\n",
"X, y = evalml.demos.load_diabetes()\n",
"\n",
"clf = evalml.AutoRegressor(objective=\"R2\")\n",
"clf = evalml.AutoRegressor(objective=\"R2\", max_pipelines=5)\n",
"\n",
"clf.fit(X, y)"
]
Expand Down Expand Up @@ -161,7 +161,7 @@
{
"data": {
"text/plain": [
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x1306d7fd0>"
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x129924690>"
]
},
"execution_count": 3,
Expand All @@ -181,7 +181,7 @@
{
"data": {
"text/plain": [
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x1306d7fd0>"
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x129924690>"
]
},
"execution_count": 4,
Expand Down Expand Up @@ -209,7 +209,7 @@
"Pipeline Name: Random Forest w/ imputation\n",
"Model type: ModelTypes.RANDOM_FOREST\n",
"Objective: R2 (greater is better)\n",
"Total training time (including CV): 6.6 seconds\n",
"Total training time (including CV): 5.8 seconds\n",
"\n",
"Parameters\n",
"==========\n",
Expand Down
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Changelog
* Added support for other units in max_time :pr:`125`
* Detect highly null columns :pr:`121`
* Added additional regression objectives :pr:`100`
* Added support for unlimited pipelines with a max_time limit :pr:`70`
* Fixes
* Reordered `describe_pipeline` :pr:`94`
* Added type check for model_type :pr:`109`
Expand Down
43 changes: 28 additions & 15 deletions evalml/models/auto_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,15 @@ def fit(self, X, y, feature_types=None, raise_errors=False):
else:
self.logger.log("Lower score is better.\n")

self.logger.log("Searching up to %s pipelines. " % self.max_pipelines, new_line=False)
# Set default max_pipeline if none specified
if self.max_pipelines is None and self.max_time is None:
self.max_pipelines = 5
self.logger.log("No search limit is set. Set using max_time or max_pipelines.\n")

if self.max_pipelines:
self.logger.log("Searching up to %s pipelines. " % self.max_pipelines)
if self.max_time:
self.logger.log("Will stop searching for new pipelines after %d seconds.\n" % self.max_time)
else:
self.logger.log("No time limit is set. Set one using max_time parameter.\n")
self.logger.log("Possible model types: %s\n" % ", ".join([model.value for model in self.possible_model_types]))

if self.detect_label_leakage:
Expand All @@ -124,17 +128,25 @@ def fit(self, X, y, feature_types=None, raise_errors=False):
if len(highly_null_columns) > 0:
self.logger.log("WARNING: {} columns are at least {}% null.".format(', '.join(highly_null_columns), self.null_threshold * 100))

pbar = tqdm(range(self.max_pipelines), disable=not self.verbose, file=stdout, bar_format='{desc} {percentage:3.0f}%|{bar}| Elapsed:{elapsed}')
start = time.time()
for n in pbar:
elapsed = time.time() - start
if self.max_time and elapsed > self.max_time:
self.logger.log("\n\nMax time elapsed. Stopping search early.")
break
self._do_iteration(X, y, pbar, raise_errors)

pbar.close()

if self.max_pipelines is None:
start = time.time()
pbar = tqdm(total=self.max_time, disable=not self.verbose, file=stdout, bar_format='{desc} | Elapsed:{elapsed}')
pbar._instances.clear()
while time.time() - start <= self.max_time:
self._do_iteration(X, y, pbar, raise_errors)
pbar.close()
else:
pbar = tqdm(range(self.max_pipelines), disable=not self.verbose, file=stdout, bar_format='{desc} {percentage:3.0f}%|{bar}| Elapsed:{elapsed}')
pbar._instances.clear()
kmax12 marked this conversation as resolved.
Show resolved Hide resolved
start = time.time()
for n in pbar:
elapsed = time.time() - start
if self.max_time and elapsed > self.max_time:
pbar.close()
self.logger.log("\n\nMax time elapsed. Stopping search early.")
christopherbunn marked this conversation as resolved.
Show resolved Hide resolved
break
self._do_iteration(X, y, pbar, raise_errors)
pbar.close()
self.logger.log("\n✔ Optimization finished")

def check_multiclass(self, y):
Expand Down Expand Up @@ -191,7 +203,8 @@ def _do_iteration(self, X, y, pbar, raise_errors):
except Exception as e:
if raise_errors:
raise e
pbar.write(str(e))
if pbar:
pbar.write(str(e))
score = np.nan
other_scores = OrderedDict(zip([n.name for n in self.additional_objectives], [np.nan] * len(self.additional_objectives)))

Expand Down
2 changes: 1 addition & 1 deletion evalml/models/auto_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class AutoClassifier(AutoBase):
def __init__(self,
objective=None,
multiclass=False,
max_pipelines=5,
max_pipelines=None,
christopherbunn marked this conversation as resolved.
Show resolved Hide resolved
max_time=None,
model_types=None,
cv=None,
Expand Down
2 changes: 1 addition & 1 deletion evalml/models/auto_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class AutoRegressor(AutoBase):

def __init__(self,
objective=None,
max_pipelines=5,
max_pipelines=None,
max_time=None,
model_types=None,
cv=None,
Expand Down
12 changes: 6 additions & 6 deletions evalml/tests/automl_tests/test_autoclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
def test_init(X_y):
X, y = X_y

clf = AutoClassifier(multiclass=False)
clf = AutoClassifier(multiclass=False, max_pipelines=1)

# check loads all pipelines
assert get_pipelines(problem_type=ProblemTypes.BINARY) == clf.possible_pipelines
Expand Down Expand Up @@ -74,7 +74,7 @@ def test_init_select_model_types():

def test_max_pipelines(X_y):
X, y = X_y
max_pipelines = 6
max_pipelines = 5
clf = AutoClassifier(max_pipelines=max_pipelines)

clf.fit(X, y)
Expand All @@ -84,7 +84,7 @@ def test_max_pipelines(X_y):

def test_best_pipeline(X_y):
X, y = X_y
max_pipelines = 3
max_pipelines = 5
clf = AutoClassifier(max_pipelines=max_pipelines)

clf.fit(X, y)
Expand All @@ -100,7 +100,7 @@ def test_specify_objective(X_y):

def test_binary_auto(X_y):
X, y = X_y
clf = AutoClassifier(objective="recall", multiclass=False)
clf = AutoClassifier(objective="recall", multiclass=False, max_pipelines=5)
clf.fit(X, y)
y_pred = clf.best_pipeline.predict(X)
assert len(np.unique(y_pred)) == 2
Expand All @@ -117,13 +117,13 @@ def test_multi_error(X_y_multi):

def test_multi_auto(X_y_multi):
X, y = X_y_multi
clf = AutoClassifier(objective="recall_micro", multiclass=True)
clf = AutoClassifier(objective="recall_micro", multiclass=True, max_pipelines=5)
clf.fit(X, y)
y_pred = clf.best_pipeline.predict(X)
assert len(np.unique(y_pred)) == 3

objective = PrecisionMicro()
clf = AutoClassifier(objective=objective, multiclass=True)
clf = AutoClassifier(objective=objective, multiclass=True, max_pipelines=5)
clf.fit(X, y)
y_pred = clf.best_pipeline.predict(X)
assert len(np.unique(y_pred)) == 3
Expand Down
26 changes: 26 additions & 0 deletions evalml/tests/test_autobase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from evalml import AutoClassifier


def test_pipeline_limits(capsys, X_y):
    """Verify the search-limit log messages emitted by ``AutoClassifier.fit``.

    Exercises four limit configurations — max_pipelines only, max_time only,
    both together, and neither — and asserts the expected progress/warning
    text is written to stdout (captured via pytest's ``capsys`` fixture).

    Args:
        capsys: pytest fixture capturing stdout/stderr.
        X_y: project fixture yielding a (features, target) pair
             # presumably a small binary-classification dataset — confirm in conftest
    """
    X, y = X_y

    # Pipeline-count limit only: fit should log the pipeline cap.
    clf = AutoClassifier(multiclass=False, max_pipelines=1)
    clf.fit(X, y)
    out, err = capsys.readouterr()
    assert "Searching up to 1 pipelines. " in out

    # Time limit only: fit should log the time cap instead.
    clf = AutoClassifier(multiclass=False, max_time=1)
    clf.fit(X, y)
    out, err = capsys.readouterr()
    assert "Will stop searching for new pipelines after 1 seconds" in out

    # Both limits set: both messages should appear.
    clf = AutoClassifier(multiclass=False, max_time=1, max_pipelines=5)
    clf.fit(X, y)
    out, err = capsys.readouterr()
    assert "Searching up to 5 pipelines. " in out
    assert "Will stop searching for new pipelines after 1 seconds" in out

    # No limits at all: fit should warn that no search limit was given.
    clf = AutoClassifier(multiclass=False)
    clf.fit(X, y)
    out, err = capsys.readouterr()
    assert "No search limit is set. Set using max_time or max_pipelines." in out