Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Random State #45

Merged
merged 13 commits into from Sep 4, 2019
3 changes: 2 additions & 1 deletion evalml/models/auto_base.py
Expand Up @@ -37,7 +37,8 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
self.results = {}
self.trained_pipelines = {}
self.random_state = random_state

random.seed(self.random_state)
np.random.seed(seed=self.random_state)
self.possible_model_types = list(set([p.model_type for p in self.possible_pipelines]))

self.tuners = {}
Expand Down
2 changes: 1 addition & 1 deletion evalml/pipelines/pipeline_base.py
Expand Up @@ -27,7 +27,7 @@ def fit(self, X, y, objective_fit_size=.2):
self.input_feature_names = X.columns.tolist()

if self.objective.needs_fitting:
X, X_objective, y, y_objective = train_test_split(X, y, test_size=objective_fit_size)
X, X_objective, y, y_objective = train_test_split(X, y, test_size=objective_fit_size, random_state=self.random_state)

self.pipeline.fit(X, y)

Expand Down
31 changes: 30 additions & 1 deletion evalml/tests/test_autoclassifier.py
Expand Up @@ -2,7 +2,7 @@
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit

from evalml import AutoClassifier
from evalml.objectives import Precision
from evalml.objectives import FraudCost, Precision
from evalml.pipelines import PipelineBase, get_pipelines


Expand Down Expand Up @@ -88,6 +88,33 @@ def test_specify_objective(X_y):
clf.fit(X, y)


def test_random_state(X_y):
    """Check that two AutoClassifier searches with the same seed rank identically.

    The check is run once with ``Precision`` and once with a ``FraudCost``
    objective, which the original test describes as an objective that
    requires fitting.
    """
    X, y = X_y

    fraud_cost = FraudCost(
        retry_percentage=.5,
        interchange_fee=.02,
        fraud_payout_percentage=.75,
        amount_col=10
    )

    def search_rankings(objective, seed):
        # Build and fit one search; construct-then-fit order is kept so the
        # sequence of operations matches running the searches back to back.
        automl = AutoClassifier(objective=objective, max_pipelines=5, random_state=seed)
        automl.fit(X, y)
        return automl.rankings

    first = search_rankings(Precision(), 0)
    second = search_rankings(Precision(), 0)
    assert first.equals(second)

    # test an objective that requires fitting
    first = search_rankings(fraud_cost, 30)
    second = search_rankings(fraud_cost, 30)

    assert first.equals(second)


def test_callback(X_y):
X, y = X_y

Expand All @@ -110,3 +137,5 @@ def add_result_callback(results, trained_pipeline, counts=counts):

assert counts["start_iteration_callback"] == max_pipelines
assert counts["add_result_callback"] == max_pipelines

# def test_serialization(trained_model)
12 changes: 12 additions & 0 deletions evalml/tests/test_autoregressor.py
Expand Up @@ -38,6 +38,18 @@ def test_init(X_y):
clf.describe_pipeline(0)


def test_random_state(X_y):
    """Check that two AutoRegressor searches with the same seed rank identically.

    Both searches use the "R2" objective, ``max_pipelines=5`` and
    ``random_state=0``; their ``rankings`` frames are then compared.
    """
    X, y = X_y
    clf = AutoRegressor(objective="R2", max_pipelines=5, random_state=0)
    clf.fit(X, y)

    clf_1 = AutoRegressor(objective="R2", max_pipelines=5, random_state=0)
    clf_1.fit(X, y)

    # need to use assert_frame_equal as R2 could be different at the 10+ decimal.
    # assert_frame_equal raises AssertionError itself on a mismatch, so it is
    # called directly: wrapping it in ``assert ... is None`` adds nothing and
    # would drop the whole comparison when asserts are stripped (python -O).
    pd.testing.assert_frame_equal(clf.rankings, clf_1.rankings)


def test_callback(X_y):
X, y = X_y

Expand Down
24 changes: 24 additions & 0 deletions evalml/tests/test_pipelines.py
Expand Up @@ -2,11 +2,14 @@
import os
import shutil

import pandas as pd
import pytest
from sklearn import datasets

import evalml.tests as tests
from evalml import load_pipeline, save_pipeline
from evalml.objectives import FraudCost
from evalml.pipelines import LogisticRegressionPipeline
from evalml.pipelines.utils import get_pipelines, list_model_types

CACHE = os.path.join(os.path.dirname(tests.__file__), '.cache')
Expand Down Expand Up @@ -49,3 +52,24 @@ def test_serialization(X_y, trained_model, path_management):
pipeline = trained_model.best_pipeline
save_pipeline(pipeline, path)
assert pipeline.score(X, y) == load_pipeline(path).score(X, y)


def test_reproducibility(X_y):
    """Check that two identically-seeded pipelines produce the same score.

    Two ``LogisticRegressionPipeline`` instances are built from the same
    arguments (including ``random_state=0``), fitted on the same data with a
    ``FraudCost`` objective, and their scores on that data are compared for
    exact equality.
    """
    X, y = X_y
    X = pd.DataFrame(X)
    y = pd.Series(y)

    objective = FraudCost(
        retry_percentage=.5,
        interchange_fee=.02,
        fraud_payout_percentage=.75,
        amount_col=10
    )

    # X is a DataFrame at this point, so ``X[0]`` selects the column labelled 0
    # and ``len(X[0])`` would be the number of rows. The column count is
    # ``X.shape[1]``, which is what ``number_features`` is presumably meant to
    # receive -- NOTE(review): confirm against LogisticRegressionPipeline.
    pipeline_kwargs = dict(
        objective=objective,
        penalty='l2',
        C=1.0,
        impute_strategy='mean',
        number_features=X.shape[1],
        random_state=0,
    )

    clf = LogisticRegressionPipeline(**pipeline_kwargs)
    clf.fit(X, y)

    clf_1 = LogisticRegressionPipeline(**pipeline_kwargs)
    clf_1.fit(X, y)

    assert clf_1.score(X, y) == clf.score(X, y)