
Merge pull request #39 from arthurpaulino/issue-36
test data can be None
arthurpaulino committed Apr 18, 2019
2 parents 0729413 + a0ceaaa commit 344cda9
Showing 8 changed files with 44 additions and 22 deletions.
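The change makes the testing dataset optional: `Engine.load_data` now takes the target column name as the second argument and `test_data` as an optional argument defaulting to None. A minimal usage sketch of the new call order, assuming an `engine` already configured as in the repository's examples and the `pulsar_stars.csv` dataset those examples use:

import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('pulsar_stars.csv')
train_data, test_data = train_test_split(data, stratify=data['target_class'],
                                          test_size=0.2, random_state=0)

# New order: training data, target column name, then (optionally) testing data
engine.load_data(train_data, 'target_class', test_data)

# test_data can now be omitted when there is nothing to predict on
engine.load_data(train_data, 'target_class')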
2 changes: 1 addition & 1 deletion examples/ensembling.py
@@ -40,7 +40,7 @@
data = pd.read_csv('pulsar_stars.csv')
train_data, test_data = train_test_split(data, stratify=data['target_class'],
test_size=0.2, random_state=0)
-engine.load_data(train_data, test_data, 'target_class')
+engine.load_data(train_data, 'target_class', test_data)

# Starting the engine
print('Training...')
2 changes: 1 addition & 1 deletion examples/getting_started.py
@@ -37,7 +37,7 @@
test_size=0.2, random_state=0)

# Now we load the data and inform the name of the target column.
-engine.load_data(train_data, test_data, 'target_class')
+engine.load_data(train_data, 'target_class', test_data)

# Ready to roll. To check if it's running asynchronously, we will start it and
# then call `is_running` after 1 second.
2 changes: 1 addition & 1 deletion examples/lightgbm_wrapper.py
@@ -91,7 +91,7 @@ def predict_proba(self, X):
data = pd.read_csv('pulsar_stars.csv')
train_data, test_data = train_test_split(data, stratify=data['target_class'],
test_size=0.2, random_state=0)
-engine.load_data(train_data, test_data, 'target_class')
+engine.load_data(train_data, 'target_class', test_data)

print('Training...')
engine.restart()
2 changes: 1 addition & 1 deletion examples/on_improvement.py
@@ -32,7 +32,7 @@ def on_improvement(status):
data = pd.read_csv('pulsar_stars.csv')
train_data, test_data = train_test_split(data, stratify=data['target_class'],
test_size=0.2, random_state=0)
-engine.load_data(train_data, test_data, 'target_class')
+engine.load_data(train_data, 'target_class', test_data)

# Starting the engine
engine.restart()
2 changes: 1 addition & 1 deletion examples/parameters_rules.py
@@ -58,7 +58,7 @@ def logistic_regression_parameters_rules(parameters):
data = pd.read_csv('pulsar_stars.csv')
train_data, test_data = train_test_split(data, stratify=data['target_class'],
test_size=0.2, random_state=0)
-engine.load_data(train_data, test_data, 'target_class')
+engine.load_data(train_data, 'target_class', test_data)

# Fire!
print('Training...')
2 changes: 1 addition & 1 deletion miraiml/__init__.py
@@ -10,7 +10,7 @@
>>> from miraiml import HyperSearchSpace, Config, Engine
"""

-__version__ = '1.0.1.3'
+__version__ = '2.0.1.3'

from .main import HyperSearchSpace, Config, Engine

28 changes: 20 additions & 8 deletions miraiml/core.py
@@ -69,9 +69,14 @@ def predict(self, X_train, y_train, X_test, config):
:raises: ``RuntimeError``
"""
-X_train, X_test = X_train[self.features], X_test[self.features]
+X_train = X_train[self.features]
train_predictions = np.zeros(X_train.shape[0])
-test_predictions = np.zeros(X_test.shape[0])

+test_predictions = None
+if not X_test is None:
+    X_test = X_test[self.features]
+    test_predictions = np.zeros(X_test.shape[0])

if config.problem_type == 'classification' and config.stratified:
fold = StratifiedKFold(n_splits=config.n_folds, shuffle=False)
elif config.problem_type == 'regression' or not config.stratified:
@@ -92,15 +97,18 @@ def predict(self, X_train, y_train, X_test, config):
try:
if config.problem_type == 'classification':
train_predictions[small_part] = model.predict_proba(X_train_small)[:,1]
-test_predictions += model.predict_proba(X_test)[:,1]
+if not X_test is None:
+    test_predictions += model.predict_proba(X_test)[:,1]
elif config.problem_type == 'regression':
train_predictions[small_part] = model.predict(X_train_small)
-test_predictions += model.predict(X_test)
+if not X_test is None:
+    test_predictions += model.predict(X_test)
except:
raise RuntimeError('Error when predicting with model class {}'.\
format(class_name))

-test_predictions /= config.n_folds
+if not X_test is None:
+    test_predictions /= config.n_folds
return (train_predictions, test_predictions, config.score_function(y_train,
train_predictions))
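For reference, the `predict` method above follows a standard cross-validated prediction pattern: out-of-fold predictions are collected for the training set, while test-set predictions, when a test set exists at all, are accumulated fold by fold and averaged. A minimal standalone sketch of that pattern for the regression case, as an illustration rather than the library's exact code:

import numpy as np
from sklearn.model_selection import KFold

def cross_val_predictions(model_class, parameters, X_train, y_train, X_test=None, n_folds=5):
    # Out-of-fold predictions for the training set
    train_predictions = np.zeros(X_train.shape[0])
    # Test predictions stay None when no test set is provided
    test_predictions = None if X_test is None else np.zeros(X_test.shape[0])
    for big_part, small_part in KFold(n_splits=n_folds).split(X_train):
        model = model_class(**parameters)
        model.fit(X_train.iloc[big_part], y_train.iloc[big_part])
        train_predictions[small_part] = model.predict(X_train.iloc[small_part])
        if X_test is not None:
            test_predictions += model.predict(X_test)
    if test_predictions is not None:
        test_predictions /= n_folds
    return train_predictions, test_predictions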

@@ -371,16 +379,20 @@ def ensemble(self, weights):
* ``test_predictions``: The ensemble predictions for the testing dataset
* ``score``: The score of the ensemble on the training dataset
"""
+test_predictions = None
id = self.base_models_ids[0]
train_predictions = weights[id]*self.train_predictions_df[id]
-test_predictions = weights[id]*self.test_predictions_df[id]
+if not self.test_predictions_df[id] is None:
+    test_predictions = weights[id]*self.test_predictions_df[id]
weights_sum = weights[id]
for id in self.base_models_ids[1:]:
train_predictions += weights[id]*self.train_predictions_df[id]
-test_predictions += weights[id]*self.test_predictions_df[id]
+if not self.test_predictions_df[id] is None:
+    test_predictions += weights[id]*self.test_predictions_df[id]
weights_sum += weights[id]
train_predictions /= weights_sum
-test_predictions /= weights_sum
+if not test_predictions is None:
+    test_predictions /= weights_sum
return (train_predictions, test_predictions,
self.config.score_function(self.y_train, train_predictions))

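The `ensemble` changes apply the same guard to the weighted average of base-model predictions: training predictions are always combined, test predictions only when they exist. A short sketch of the computation with a hypothetical helper, assuming either every base model has test predictions or none does:

def weighted_ensemble(weights, train_preds, test_preds):
    # weights, train_preds and test_preds are dicts keyed by base model id
    ids = list(train_preds)
    weights_sum = sum(weights[id] for id in ids)
    train_ensemble = sum(weights[id] * train_preds[id] for id in ids) / weights_sum
    test_ensemble = None
    if all(test_preds[id] is not None for id in ids):
        test_ensemble = sum(weights[id] * test_preds[id] for id in ids) / weights_sum
    return train_ensemble, test_ensemble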
26 changes: 18 additions & 8 deletions miraiml/main.py
@@ -297,24 +297,28 @@ def interrupt(self):
time.sleep(.1)
self.must_interrupt = False

-def load_data(self, train_data, test_data, target_column, restart=False):
+def load_data(self, train_data, target_column, test_data=None, restart=False):
"""
Interrupts the engine and loads a new pair of train/test datasets.
:type train_data: pandas.DataFrame
:param train_data: The training data.
-:type test_data: pandas.DataFrame
-:param test_data: The testing data.
:type target_column: str
:param target_column: The name of the target column.
+:type test_data: pandas.DataFrame, optional, default=None
+:param test_data: The testing data. Use the default value if you don't
+    need to make predictions for data with unknown labels.
:type restart: bool, optional, default=False
:param restart: Whether to restart the engine after updating data or not.
"""
-if type(train_data) != pd.DataFrame or type(test_data) != pd.DataFrame:
-    raise TypeError('Data must be of type \'pandas.DataFrame\'')
+if type(train_data) != pd.DataFrame:
+    raise TypeError('Training data must be an object of pandas.DataFrame')
+
+if type(test_data) != type(None) and type(test_data) != pd.DataFrame:
+    raise TypeError('Testing data must be None or an object of pandas.DataFrame')

self.interrupt()
self.train_data = train_data
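The new validation checks each argument separately, so omitting the testing data is accepted while a wrong type is still rejected. A quick illustration of the intended behavior, as a hypothetical snippet assuming `engine` and `train_data` as in the examples:

engine.load_data(train_data, 'target_class')               # ok: test_data defaults to None
engine.load_data([[1, 2]], 'target_class')                 # TypeError: training data must be a pandas.DataFrame
engine.load_data(train_data, 'target_class', test_data=1)  # TypeError: testing data must be None or a pandas.DataFrame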
@@ -511,12 +515,18 @@ def request_status(self):
* ``'scores'``: A dictionary containing the score of each id
* ``'predictions'``: A ``pandas.Series`` object containing the\
-predictions of the best id for the testing dataset
+predictions of the best id for the testing dataset. If no testing
+dataset was provided, the value associated with this key is None.
"""
if self.best_id is None:
return None

+predictions = None
+if not self.test_data is None:
+    predictions = self.test_predictions_df[self.best_id].copy()

return dict(
score = self.scores[self.best_id],
scores = self.scores.copy(),
-predictions = self.test_predictions_df[self.best_id].copy()
+predictions = predictions
)
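With this change, callers of `request_status` should expect the `'predictions'` entry to be None when no testing dataset was loaded. A small usage sketch, assuming a running `engine`:

status = engine.request_status()
if status is not None:
    print('Best score:', status['score'])
    if status['predictions'] is None:
        print('No testing dataset was provided, so there are no predictions.')
    else:
        status['predictions'].to_csv('predictions.csv', index=False)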
