Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for unlim pipelines with max_time limit #70

Merged
merged 24 commits into from
Nov 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
377c5d9
Added support for unlim pipelines with max_time limit
christopherbunn Sep 13, 2019
27c69fc
Fixed lint errors
christopherbunn Sep 13, 2019
8265ebf
Increased number of test_binary_auto pipelines to 5
christopherbunn Sep 13, 2019
cde7f15
Fixed max_pipelines=None behavior and removed extraneous comment
christopherbunn Sep 13, 2019
d35a4a2
Reverted some AutoClassifier tests to use max_pipelines=5
christopherbunn Sep 16, 2019
540e4fa
Merge branch 'unlim-pipelines' of github.com:FeatureLabs/evalml into …
christopherbunn Oct 8, 2019
4c8a7a6
Merge branch 'master' of github.com:FeatureLabs/evalml into unlim-pip…
christopherbunn Oct 14, 2019
c39b2d6
Changed the format of the progress logs for max_time
christopherbunn Oct 14, 2019
ed89083
Merge branch 'master' into unlim-pipelines
christopherbunn Oct 17, 2019
ebf5f50
Merge branch 'master' of github.com:FeatureLabs/evalml into unlim-pip…
christopherbunn Oct 22, 2019
dc8e8f3
Changed to new pbar format and modified error msg
christopherbunn Oct 22, 2019
9a25853
Updated notebook example to include search limit
christopherbunn Oct 23, 2019
747d905
Updated limit handling to allow for no time parameters
christopherbunn Oct 23, 2019
7373b65
Fixed lint errors
christopherbunn Oct 23, 2019
1668b05
Merge branch 'master' of github.com:FeatureLabs/evalml into unlim-pip…
christopherbunn Oct 28, 2019
234cecf
Updated changelog
christopherbunn Oct 28, 2019
06e51c1
Merge branch 'master' of github.com:FeatureLabs/evalml into unlim-pip…
christopherbunn Oct 29, 2019
76cae9e
Closed pbar on early termination and removed new_line
christopherbunn Oct 29, 2019
eb7f6f2
Status bar changes
christopherbunn Oct 31, 2019
1b29167
Merge branch 'master' into unlim-pipelines
christopherbunn Oct 31, 2019
e98dd26
Merge branch 'master' into unlim-pipelines
christopherbunn Oct 31, 2019
a397993
Fixed lint error
christopherbunn Oct 31, 2019
c17334e
Merge branch 'unlim-pipelines' of github.com:FeatureLabs/evalml into …
christopherbunn Oct 31, 2019
7dbc123
Updated test and removed elapsed variable
christopherbunn Oct 31, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ licenses/
__pycache__/
*.py[cod]
*$py.class
**/.DS_Store
.DS_Store

# C extensions
*.so
Expand Down
20 changes: 10 additions & 10 deletions docs/source/automl/regression_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@
"\n",
"Possible model types: random_forest\n",
"\n",
"✔ Random Forest w/ imputation: 0%| | Elapsed:00:06\n",
"✔ Random Forest w/ imputation: 20%|██ | Elapsed:00:11\n",
"✔ Random Forest w/ imputation: 40%|████ | Elapsed:00:18\n",
"✔ Random Forest w/ imputation: 60%|██████ | Elapsed:00:24\n",
"✔ Random Forest w/ imputation: 80%|████████ | Elapsed:00:31\n",
"✔ Random Forest w/ imputation: 100%|██████████| Elapsed:00:31\n",
"✔ Random Forest w/ imputation: 0%| | Elapsed:00:05\n",
"✔ Random Forest w/ imputation: 20%|██ | Elapsed:00:10\n",
"✔ Random Forest w/ imputation: 40%|████ | Elapsed:00:16\n",
"✔ Random Forest w/ imputation: 60%|██████ | Elapsed:00:22\n",
"✔ Random Forest w/ imputation: 80%|████████ | Elapsed:00:30\n",
"✔ Random Forest w/ imputation: 100%|██████████| Elapsed:00:30\n",
"\n",
"✔ Optimization finished\n"
]
Expand All @@ -45,7 +45,7 @@
"\n",
"X, y = evalml.demos.load_diabetes()\n",
"\n",
"clf = evalml.AutoRegressor(objective=\"R2\")\n",
"clf = evalml.AutoRegressor(objective=\"R2\", max_pipelines=5)\n",
"\n",
"clf.fit(X, y)"
]
Expand Down Expand Up @@ -161,7 +161,7 @@
{
"data": {
"text/plain": [
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x1306d7fd0>"
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x129924690>"
]
},
"execution_count": 3,
Expand All @@ -181,7 +181,7 @@
{
"data": {
"text/plain": [
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x1306d7fd0>"
"<evalml.pipelines.regression.random_forest.RFRegressionPipeline at 0x129924690>"
]
},
"execution_count": 4,
Expand Down Expand Up @@ -209,7 +209,7 @@
"Pipeline Name: Random Forest w/ imputation\n",
"Model type: ModelTypes.RANDOM_FOREST\n",
"Objective: R2 (greater is better)\n",
"Total training time (including CV): 6.6 seconds\n",
"Total training time (including CV): 5.8 seconds\n",
"\n",
"Parameters\n",
"==========\n",
Expand Down
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Changelog
* Added support for other units in max_time :pr:`125`
* Detect highly null columns :pr:`121`
* Added additional regression objectives :pr:`100`
* Added support for unlimited pipelines with a max_time limit :pr:`70`
* Fixes
* Reordered `describe_pipeline` :pr:`94`
* Added type check for model_type :pr:`109`
Expand Down
43 changes: 28 additions & 15 deletions evalml/models/auto_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,15 @@ def fit(self, X, y, feature_types=None, raise_errors=False):
else:
self.logger.log("Lower score is better.\n")

self.logger.log("Searching up to %s pipelines. " % self.max_pipelines, new_line=False)
# Set default max_pipeline if none specified
if self.max_pipelines is None and self.max_time is None:
self.max_pipelines = 5
self.logger.log("No search limit is set. Set using max_time or max_pipelines.\n")

if self.max_pipelines:
self.logger.log("Searching up to %s pipelines. " % self.max_pipelines)
if self.max_time:
self.logger.log("Will stop searching for new pipelines after %d seconds.\n" % self.max_time)
else:
self.logger.log("No time limit is set. Set one using max_time parameter.\n")
self.logger.log("Possible model types: %s\n" % ", ".join([model.value for model in self.possible_model_types]))

if self.detect_label_leakage:
Expand All @@ -124,17 +128,25 @@ def fit(self, X, y, feature_types=None, raise_errors=False):
if len(highly_null_columns) > 0:
self.logger.log("WARNING: {} columns are at least {}% null.".format(', '.join(highly_null_columns), self.null_threshold * 100))

pbar = tqdm(range(self.max_pipelines), disable=not self.verbose, file=stdout, bar_format='{desc} {percentage:3.0f}%|{bar}| Elapsed:{elapsed}')
start = time.time()
for n in pbar:
elapsed = time.time() - start
if self.max_time and elapsed > self.max_time:
self.logger.log("\n\nMax time elapsed. Stopping search early.")
break
self._do_iteration(X, y, pbar, raise_errors)

pbar.close()

if self.max_pipelines is None:
start = time.time()
pbar = tqdm(total=self.max_time, disable=not self.verbose, file=stdout, bar_format='{desc} | Elapsed:{elapsed}')
pbar._instances.clear()
while time.time() - start <= self.max_time:
self._do_iteration(X, y, pbar, raise_errors)
pbar.close()
else:
pbar = tqdm(range(self.max_pipelines), disable=not self.verbose, file=stdout, bar_format='{desc} {percentage:3.0f}%|{bar}| Elapsed:{elapsed}')
pbar._instances.clear()
kmax12 marked this conversation as resolved.
Show resolved Hide resolved
start = time.time()
for n in pbar:
elapsed = time.time() - start
if self.max_time and elapsed > self.max_time:
pbar.close()
self.logger.log("\n\nMax time elapsed. Stopping search early.")
christopherbunn marked this conversation as resolved.
Show resolved Hide resolved
break
self._do_iteration(X, y, pbar, raise_errors)
pbar.close()
self.logger.log("\n✔ Optimization finished")

def check_multiclass(self, y):
Expand Down Expand Up @@ -191,7 +203,8 @@ def _do_iteration(self, X, y, pbar, raise_errors):
except Exception as e:
if raise_errors:
raise e
pbar.write(str(e))
if pbar:
pbar.write(str(e))
score = np.nan
other_scores = OrderedDict(zip([n.name for n in self.additional_objectives], [np.nan] * len(self.additional_objectives)))

Expand Down
2 changes: 1 addition & 1 deletion evalml/models/auto_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class AutoClassifier(AutoBase):
def __init__(self,
objective=None,
multiclass=False,
max_pipelines=5,
max_pipelines=None,
christopherbunn marked this conversation as resolved.
Show resolved Hide resolved
max_time=None,
model_types=None,
cv=None,
Expand Down
2 changes: 1 addition & 1 deletion evalml/models/auto_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class AutoRegressor(AutoBase):

def __init__(self,
objective=None,
max_pipelines=5,
max_pipelines=None,
max_time=None,
model_types=None,
cv=None,
Expand Down
12 changes: 6 additions & 6 deletions evalml/tests/automl_tests/test_autoclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
def test_init(X_y):
X, y = X_y

clf = AutoClassifier(multiclass=False)
clf = AutoClassifier(multiclass=False, max_pipelines=1)

# check loads all pipelines
assert get_pipelines(problem_type=ProblemTypes.BINARY) == clf.possible_pipelines
Expand Down Expand Up @@ -74,7 +74,7 @@ def test_init_select_model_types():

def test_max_pipelines(X_y):
X, y = X_y
max_pipelines = 6
max_pipelines = 5
clf = AutoClassifier(max_pipelines=max_pipelines)

clf.fit(X, y)
Expand All @@ -84,7 +84,7 @@ def test_max_pipelines(X_y):

def test_best_pipeline(X_y):
X, y = X_y
max_pipelines = 3
max_pipelines = 5
clf = AutoClassifier(max_pipelines=max_pipelines)

clf.fit(X, y)
Expand All @@ -100,7 +100,7 @@ def test_specify_objective(X_y):

def test_binary_auto(X_y):
X, y = X_y
clf = AutoClassifier(objective="recall", multiclass=False)
clf = AutoClassifier(objective="recall", multiclass=False, max_pipelines=5)
clf.fit(X, y)
y_pred = clf.best_pipeline.predict(X)
assert len(np.unique(y_pred)) == 2
Expand All @@ -117,13 +117,13 @@ def test_multi_error(X_y_multi):

def test_multi_auto(X_y_multi):
X, y = X_y_multi
clf = AutoClassifier(objective="recall_micro", multiclass=True)
clf = AutoClassifier(objective="recall_micro", multiclass=True, max_pipelines=5)
clf.fit(X, y)
y_pred = clf.best_pipeline.predict(X)
assert len(np.unique(y_pred)) == 3

objective = PrecisionMicro()
clf = AutoClassifier(objective=objective, multiclass=True)
clf = AutoClassifier(objective=objective, multiclass=True, max_pipelines=5)
clf.fit(X, y)
y_pred = clf.best_pipeline.predict(X)
assert len(np.unique(y_pred)) == 3
Expand Down
26 changes: 26 additions & 0 deletions evalml/tests/test_autobase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from evalml import AutoClassifier


def test_pipeline_limits(capsys, X_y):
    """Verify the search-limit log messages emitted by ``AutoClassifier.fit``.

    Exercises four limit configurations — max_pipelines only, max_time only,
    both together, and neither — and asserts the expected progress/warning
    text is written to stdout (captured via pytest's ``capsys`` fixture).

    Args:
        capsys: pytest fixture capturing stdout/stderr.
        X_y: project fixture yielding a (features, target) pair
             # presumably a small binary-classification dataset — confirm in conftest
    """
    X, y = X_y

    # Pipeline-count limit only: fit should log the pipeline cap.
    clf = AutoClassifier(multiclass=False, max_pipelines=1)
    clf.fit(X, y)
    out, err = capsys.readouterr()
    assert "Searching up to 1 pipelines. " in out

    # Time limit only: fit should log the time cap instead.
    clf = AutoClassifier(multiclass=False, max_time=1)
    clf.fit(X, y)
    out, err = capsys.readouterr()
    assert "Will stop searching for new pipelines after 1 seconds" in out

    # Both limits set: both messages should appear.
    clf = AutoClassifier(multiclass=False, max_time=1, max_pipelines=5)
    clf.fit(X, y)
    out, err = capsys.readouterr()
    assert "Searching up to 5 pipelines. " in out
    assert "Will stop searching for new pipelines after 1 seconds" in out

    # No limits at all: fit should warn that no search limit was given.
    clf = AutoClassifier(multiclass=False)
    clf.fit(X, y)
    out, err = capsys.readouterr()
    assert "No search limit is set. Set using max_time or max_pipelines." in out