
Merge pull request #39 from arthurpaulino/issue-36
test data can be None
arthurpaulino committed Apr 18, 2019
2 parents 0729413 + a0ceaaa commit 344cda9
Showing 8 changed files with 44 additions and 22 deletions.
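The change makes the testing dataset optional: `Engine.load_data` now takes the target column name as the second argument and `test_data` as an optional argument defaulting to None. A minimal usage sketch of the new call order, assuming an `engine` already configured as in the repository's examples and the `pulsar_stars.csv` dataset those examples use:

import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('pulsar_stars.csv')
train_data, test_data = train_test_split(data, stratify=data['target_class'],
                                          test_size=0.2, random_state=0)

# New order: training data, target column name, then (optionally) testing data
engine.load_data(train_data, 'target_class', test_data)

# test_data can now be omitted when there is nothing to predict on
engine.load_data(train_data, 'target_class')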
2 changes: 1 addition & 1 deletion examples/ensembling.py
@@ -40,7 +40,7 @@
data = pd.read_csv('pulsar_stars.csv')
train_data, test_data = train_test_split(data, stratify=data['target_class'],
test_size=0.2, random_state=0)
-engine.load_data(train_data, test_data, 'target_class')
+engine.load_data(train_data, 'target_class', test_data)

# Starting the engine
print('Training...')
2 changes: 1 addition & 1 deletion examples/getting_started.py
@@ -37,7 +37,7 @@
test_size=0.2, random_state=0)

# Now we load the data and inform the name of the target column.
-engine.load_data(train_data, test_data, 'target_class')
+engine.load_data(train_data, 'target_class', test_data)

# Ready to roll. To check if it's running asynchronously, we will start it and
# then call `is_running` after 1 second.
2 changes: 1 addition & 1 deletion examples/lightgbm_wrapper.py
@@ -91,7 +91,7 @@ def predict_proba(self, X):
data = pd.read_csv('pulsar_stars.csv')
train_data, test_data = train_test_split(data, stratify=data['target_class'],
test_size=0.2, random_state=0)
-engine.load_data(train_data, test_data, 'target_class')
+engine.load_data(train_data, 'target_class', test_data)

print('Training...')
engine.restart()
2 changes: 1 addition & 1 deletion examples/on_improvement.py
@@ -32,7 +32,7 @@ def on_improvement(status):
data = pd.read_csv('pulsar_stars.csv')
train_data, test_data = train_test_split(data, stratify=data['target_class'],
test_size=0.2, random_state=0)
-engine.load_data(train_data, test_data, 'target_class')
+engine.load_data(train_data, 'target_class', test_data)

# Starting the engine
engine.restart()
2 changes: 1 addition & 1 deletion examples/parameters_rules.py
@@ -58,7 +58,7 @@ def logistic_regression_parameters_rules(parameters):
data = pd.read_csv('pulsar_stars.csv')
train_data, test_data = train_test_split(data, stratify=data['target_class'],
test_size=0.2, random_state=0)
-engine.load_data(train_data, test_data, 'target_class')
+engine.load_data(train_data, 'target_class', test_data)

# Fire!
print('Training...')
2 changes: 1 addition & 1 deletion miraiml/__init__.py
@@ -10,7 +10,7 @@
>>> from miraiml import HyperSearchSpace, Config, Engine
"""

-__version__ = '1.0.1.3'
+__version__ = '2.0.1.3'

from .main import HyperSearchSpace, Config, Engine

28 changes: 20 additions & 8 deletions miraiml/core.py
@@ -69,9 +69,14 @@ def predict(self, X_train, y_train, X_test, config):
:raises: ``RuntimeError``
"""
-X_train, X_test = X_train[self.features], X_test[self.features]
+X_train = X_train[self.features]
train_predictions = np.zeros(X_train.shape[0])
-test_predictions = np.zeros(X_test.shape[0])

+test_predictions = None
+if not X_test is None:
+    X_test = X_test[self.features]
+    test_predictions = np.zeros(X_test.shape[0])

if config.problem_type == 'classification' and config.stratified:
fold = StratifiedKFold(n_splits=config.n_folds, shuffle=False)
elif config.problem_type == 'regression' or not config.stratified:
@@ -92,15 +97,18 @@ def predict(self, X_train, y_train, X_test, config):
try:
if config.problem_type == 'classification':
train_predictions[small_part] = model.predict_proba(X_train_small)[:,1]
-test_predictions += model.predict_proba(X_test)[:,1]
+if not X_test is None:
+    test_predictions += model.predict_proba(X_test)[:,1]
elif config.problem_type == 'regression':
train_predictions[small_part] = model.predict(X_train_small)
-test_predictions += model.predict(X_test)
+if not X_test is None:
+    test_predictions += model.predict(X_test)
except:
raise RuntimeError('Error when predicting with model class {}'.\
format(class_name))

-test_predictions /= config.n_folds
+if not X_test is None:
+    test_predictions /= config.n_folds
return (train_predictions, test_predictions, config.score_function(y_train,
train_predictions))
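For reference, the `predict` method above follows a standard cross-validated prediction pattern: out-of-fold predictions are collected for the training set, while test-set predictions, when a test set exists at all, are accumulated fold by fold and averaged. A minimal standalone sketch of that pattern for the regression case, as an illustration rather than the library's exact code:

import numpy as np
from sklearn.model_selection import KFold

def cross_val_predictions(model_class, parameters, X_train, y_train, X_test=None, n_folds=5):
    # Out-of-fold predictions for the training set
    train_predictions = np.zeros(X_train.shape[0])
    # Test predictions stay None when no test set is provided
    test_predictions = None if X_test is None else np.zeros(X_test.shape[0])
    for big_part, small_part in KFold(n_splits=n_folds).split(X_train):
        model = model_class(**parameters)
        model.fit(X_train.iloc[big_part], y_train.iloc[big_part])
        train_predictions[small_part] = model.predict(X_train.iloc[small_part])
        if X_test is not None:
            test_predictions += model.predict(X_test)
    if test_predictions is not None:
        test_predictions /= n_folds
    return train_predictions, test_predictions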

@@ -371,16 +379,20 @@ def ensemble(self, weights):
* ``test_predictions``: The ensemble predictions for the testing dataset
* ``score``: The score of the ensemble on the training dataset
"""
+test_predictions = None
id = self.base_models_ids[0]
train_predictions = weights[id]*self.train_predictions_df[id]
-test_predictions = weights[id]*self.test_predictions_df[id]
+if not self.test_predictions_df[id] is None:
+    test_predictions = weights[id]*self.test_predictions_df[id]
weights_sum = weights[id]
for id in self.base_models_ids[1:]:
train_predictions += weights[id]*self.train_predictions_df[id]
-test_predictions += weights[id]*self.test_predictions_df[id]
+if not self.test_predictions_df[id] is None:
+    test_predictions += weights[id]*self.test_predictions_df[id]
weights_sum += weights[id]
train_predictions /= weights_sum
-test_predictions /= weights_sum
+if not test_predictions is None:
+    test_predictions /= weights_sum
return (train_predictions, test_predictions,
self.config.score_function(self.y_train, train_predictions))

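The `ensemble` changes apply the same guard to the weighted average of base-model predictions: training predictions are always combined, test predictions only when they exist. A short sketch of the computation with a hypothetical helper, assuming either every base model has test predictions or none does:

def weighted_ensemble(weights, train_preds, test_preds):
    # weights, train_preds and test_preds are dicts keyed by base model id
    ids = list(train_preds)
    weights_sum = sum(weights[id] for id in ids)
    train_ensemble = sum(weights[id] * train_preds[id] for id in ids) / weights_sum
    test_ensemble = None
    if all(test_preds[id] is not None for id in ids):
        test_ensemble = sum(weights[id] * test_preds[id] for id in ids) / weights_sum
    return train_ensemble, test_ensemble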
26 changes: 18 additions & 8 deletions miraiml/main.py
@@ -297,24 +297,28 @@ def interrupt(self):
time.sleep(.1)
self.must_interrupt = False

-def load_data(self, train_data, test_data, target_column, restart=False):
+def load_data(self, train_data, target_column, test_data=None, restart=False):
"""
Interrupts the engine and loads a new pair of train/test datasets.
:type train_data: pandas.DataFrame
:param train_data: The training data.
-:type test_data: pandas.DataFrame
-:param test_data: The testing data.
:type target_column: str
:param target_column: The name of the target column.
+:type test_data: pandas.DataFrame, optional, default=None
+:param test_data: The testing data. Use the default value if you don't
+    need to make predictions for data with unknown labels.
:type restart: bool, optional, default=False
:param restart: Whether to restart the engine after updating data or not.
"""
-if type(train_data) != pd.DataFrame or type(test_data) != pd.DataFrame:
-    raise TypeError('Data must be of type \'pandas.DataFrame\'')
+if type(train_data) != pd.DataFrame:
+    raise TypeError('Training data must be an object of pandas.DataFrame')
+
+if type(test_data) != type(None) and type(test_data) != pd.DataFrame:
+    raise TypeError('Testing data must be None or an object of pandas.DataFrame')

self.interrupt()
self.train_data = train_data
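The new validation checks each argument separately, so omitting the testing data is accepted while a wrong type is still rejected. A quick illustration of the intended behavior, as a hypothetical snippet assuming `engine` and `train_data` as in the examples:

engine.load_data(train_data, 'target_class')               # ok: test_data defaults to None
engine.load_data([[1, 2]], 'target_class')                 # TypeError: training data must be a pandas.DataFrame
engine.load_data(train_data, 'target_class', test_data=1)  # TypeError: testing data must be None or a pandas.DataFrame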
@@ -511,12 +515,18 @@ def request_status(self):
* ``'scores'``: A dictionary containing the score of each id
* ``'predictions'``: A ``pandas.Series`` object containing the\
-predictions of the best id for the testing dataset
+predictions of the best id for the testing dataset. If no testing
+dataset was provided, the value associated with this key is None.
"""
if self.best_id is None:
return None

+predictions = None
+if not self.test_data is None:
+    predictions = self.test_predictions_df[self.best_id].copy()

return dict(
score = self.scores[self.best_id],
scores = self.scores.copy(),
-predictions = self.test_predictions_df[self.best_id].copy()
+predictions = predictions
)
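With this change, callers of `request_status` should expect the `'predictions'` entry to be None when no testing dataset was loaded. A small usage sketch, assuming a running `engine`:

status = engine.request_status()
if status is not None:
    print('Best score:', status['score'])
    if status['predictions'] is None:
        print('No testing dataset was provided, so there are no predictions.')
    else:
        status['predictions'].to_csv('predictions.csv', index=False)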
