[FIX] Minor Fixes #306

Merged · 6 commits · Nov 9, 2021
60 changes: 35 additions & 25 deletions autoPyTorch/api/base_task.py
@@ -36,7 +36,7 @@
STRING_TO_TASK_TYPES,
)
from autoPyTorch.data.base_validator import BaseInputValidator
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes
from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
from autoPyTorch.ensemble.singlebest_ensemble import SingleBest
@@ -105,6 +105,8 @@ class BaseTask:
Args:
seed (int), (default=1): seed to be used for reproducibility.
n_jobs (int), (default=1): number of consecutive processes to spawn.
n_threads (int), (default=1):
number of threads to use for each process.
logging_config (Optional[Dict]): specifies configuration
for logging, if None, it is loaded from the logging.yaml
ensemble_size (int), (default=50): Number of models added to the ensemble built by
@@ -133,6 +135,7 @@ def __init__(
self,
seed: int = 1,
n_jobs: int = 1,
n_threads: int = 1,
logging_config: Optional[Dict] = None,
ensemble_size: int = 50,
ensemble_nbest: int = 50,
@@ -151,6 +154,7 @@ def __init__(
) -> None:
self.seed = seed
self.n_jobs = n_jobs
self.n_threads = n_threads
self.ensemble_size = ensemble_size
self.ensemble_nbest = ensemble_nbest
self.max_models_on_disc = max_models_on_disc
@@ -1064,6 +1068,28 @@ def _search(

return self

def _get_fit_dictionary(
self,
dataset_properties: Dict[str, BaseDatasetPropertiesType],
dataset: BaseDataset,
split_id: int = 0
) -> Dict[str, Any]:
X_test = dataset.test_tensors[0].copy() if dataset.test_tensors is not None else None
y_test = dataset.test_tensors[1].copy() if dataset.test_tensors is not None else None
X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
'backend': self._backend,
'X_train': dataset.train_tensors[0].copy(),
'y_train': dataset.train_tensors[1].copy(),
'X_test': X_test,
'y_test': y_test,
'train_indices': dataset.splits[split_id][0],
'val_indices': dataset.splits[split_id][1],
'split_id': split_id,
'num_run': self._backend.get_next_num_run(),
})
X.update(self.pipeline_options)
return X

def refit(
self,
dataset: BaseDataset,
@@ -1107,18 +1133,6 @@ def refit(
dataset_properties = dataset.get_dataset_properties(dataset_requirements)
self._backend.save_datamanager(dataset)

X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
'backend': self._backend,
'X_train': dataset.train_tensors[0],
'y_train': dataset.train_tensors[1],
'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
'train_indices': dataset.splits[split_id][0],
'val_indices': dataset.splits[split_id][1],
'split_id': split_id,
'num_run': self._backend.get_next_num_run(),
})
X.update(self.pipeline_options)
if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
self._load_models()

@@ -1134,6 +1148,10 @@ def refit(
# try to fit the model. If it fails, shuffle the data. This
# could alleviate the problem in algorithms that depend on
# the ordering of the data.
X = self._get_fit_dictionary(
dataset_properties=dataset_properties,
dataset=dataset,
split_id=split_id)
fit_and_suppress_warnings(self._logger, model, X, y=None)

self._clean_logger()
@@ -1187,18 +1205,10 @@ def fit(self,
pipeline.set_hyperparameters(pipeline_config)

# initialise fit dictionary
X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
'backend': self._backend,
'X_train': dataset.train_tensors[0],
'y_train': dataset.train_tensors[1],
'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
'train_indices': dataset.splits[split_id][0],
'val_indices': dataset.splits[split_id][1],
'split_id': split_id,
'num_run': self._backend.get_next_num_run(),
})
X.update(self.pipeline_options)
X = self._get_fit_dictionary(
dataset_properties=dataset_properties,
dataset=dataset,
split_id=split_id)

fit_and_suppress_warnings(self._logger, pipeline, X, y=None)

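The duplicated fit-dictionary construction in `refit` and `fit` is now centralized in the new `_get_fit_dictionary` helper, which also copies the train/test tensors before handing them to the pipeline. A minimal, self-contained sketch of the resulting pattern (`SimpleBackend` and `get_fit_dictionary` below are illustrative stand-ins, not autoPyTorch APIs):

```python
# Sketch of the refactor in this file: refit() and fit() now build the fit
# dictionary through one helper instead of duplicating the dict literal.
from typing import Any, Dict, List, Tuple

import numpy as np


class SimpleBackend:
    """Stand-in for autoPyTorch's backend; only tracks run numbers."""

    def __init__(self) -> None:
        self._num_run = 0

    def get_next_num_run(self) -> int:
        self._num_run += 1
        return self._num_run


def get_fit_dictionary(dataset_properties: Dict[str, Any],
                       train_tensors: Tuple[np.ndarray, np.ndarray],
                       splits: List[Tuple[np.ndarray, np.ndarray]],
                       backend: SimpleBackend,
                       pipeline_options: Dict[str, Any],
                       split_id: int = 0) -> Dict[str, Any]:
    X: Dict[str, Any] = {
        'dataset_properties': dataset_properties,
        'backend': backend,
        'X_train': train_tensors[0].copy(),  # copies avoid mutating the dataset
        'y_train': train_tensors[1].copy(),
        'train_indices': splits[split_id][0],
        'val_indices': splits[split_id][1],
        'split_id': split_id,
        'num_run': backend.get_next_num_run(),
    }
    X.update(pipeline_options)  # e.g. device, epochs, budget settings
    return X


if __name__ == '__main__':
    X_train = np.random.rand(10, 3)
    y_train = np.random.randint(0, 2, size=10)
    fit_dict = get_fit_dictionary(
        dataset_properties={'task_type': 'tabular_classification'},
        train_tensors=(X_train, y_train),
        splits=[(np.arange(8), np.arange(8, 10))],
        backend=SimpleBackend(),
        pipeline_options={'device': 'cpu', 'epochs': 5},
    )
    print(sorted(fit_dict))
```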
10 changes: 6 additions & 4 deletions autoPyTorch/api/tabular_classification.py
@@ -26,10 +26,10 @@ class TabularClassificationTask(BaseTask):
"""
Tabular Classification API to the pipelines.
Args:
seed (int):
seed to be used for reproducibility.
n_jobs (int), (default=1):
number of consecutive processes to spawn.
seed (int), (default=1): seed to be used for reproducibility.
n_jobs (int), (default=1): number of consecutive processes to spawn.
n_threads (int), (default=1):
number of threads to use for each process.
logging_config (Optional[Dict]):
specifies configuration for logging, if None, it is loaded from the logging.yaml
ensemble_size (int), (default=50):
@@ -64,6 +64,7 @@ def __init__(
self,
seed: int = 1,
n_jobs: int = 1,
n_threads: int = 1,
logging_config: Optional[Dict] = None,
ensemble_size: int = 50,
ensemble_nbest: int = 50,
@@ -82,6 +83,7 @@
super().__init__(
seed=seed,
n_jobs=n_jobs,
n_threads=n_threads,
logging_config=logging_config,
ensemble_size=ensemble_size,
ensemble_nbest=ensemble_nbest,
6 changes: 5 additions & 1 deletion autoPyTorch/api/tabular_regression.py
@@ -26,8 +26,10 @@ class TabularRegressionTask(BaseTask):
"""
Tabular Regression API to the pipelines.
Args:
seed (int): seed to be used for reproducibility.
seed (int), (default=1): seed to be used for reproducibility.
n_jobs (int), (default=1): number of consecutive processes to spawn.
n_threads (int), (default=1):
number of threads to use for each process.
logging_config (Optional[Dict]): specifies configuration
for logging, if None, it is loaded from the logging.yaml
ensemble_size (int), (default=50): Number of models added to the ensemble built by
@@ -56,6 +58,7 @@ def __init__(
self,
seed: int = 1,
n_jobs: int = 1,
n_threads: int = 1,
logging_config: Optional[Dict] = None,
ensemble_size: int = 50,
ensemble_nbest: int = 50,
@@ -74,6 +77,7 @@
super().__init__(
seed=seed,
n_jobs=n_jobs,
n_threads=n_threads,
logging_config=logging_config,
ensemble_size=ensemble_size,
ensemble_nbest=ensemble_nbest,
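Both task APIs now forward the new `n_threads` argument to `BaseTask`. A hedged usage sketch (argument values are arbitrary and assume autoPyTorch is installed):

```python
from autoPyTorch.api.tabular_classification import TabularClassificationTask

# n_threads complements n_jobs: per the docstrings in this PR, n_jobs controls
# how many consecutive processes are spawned, n_threads how many threads each
# of those processes may use.
api = TabularClassificationTask(
    seed=1,
    n_jobs=4,
    n_threads=2,
)
```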
@@ -39,19 +39,19 @@ def build_backbone(self, input_shape: Tuple[int, ...]) -> torch.nn.Sequential:
self.config.update(
{"num_units_%d" % (i): num for i, num in enumerate(neuron_counts)}
)
# we are skipping the last layer, as the function get_shaped_neuron_counts
# is built for getting neuron counts, so it will add the out_features to
# the last layer. However, in dropout we dont want to have that, we just
# want to use the shape and not worry about the output.
if self.config['use_dropout']:
# the last dropout ("neuron") value is skipped since it will be equal
# to output_feat, which is 0. This is also skipped when getting the
# n_units for the architecture, since, it is mostly implemented for the
# output layer, which is part of the head and not of the backbone.
dropout_shape = get_shaped_neuron_counts(
self.config['resnet_shape'], 0, 0, 1000, self.config['num_groups'] + 1
shape=self.config['resnet_shape'],
in_feat=0,
out_feat=0,
max_neurons=self.config["max_dropout"],
layer_count=self.config['num_groups'] + 1,
)[:-1]

dropout_shape = [
dropout / 1000 * self.config["max_dropout"] for dropout in dropout_shape
]

self.config.update(
{"dropout_%d" % (i + 1): dropout for i, dropout in enumerate(dropout_shape)}
)
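The dropout shape is no longer computed on a 0-1000 neuron scale and rescaled afterwards; `get_shaped_neuron_counts` is now asked for values already capped at `max_dropout`. A toy sketch of why the two formulations agree for a linearly shrinking shape (`funnel_shape` is a stand-in for illustration only, not the real helper):

```python
# Toy illustration of the dropout-shape change, assuming the shape values scale
# linearly with the requested maximum.
from typing import List


def funnel_shape(max_value: float, layer_count: int) -> List[float]:
    # linearly decreasing values starting at max_value (toy stand-in)
    return [max_value * (layer_count - i) / layer_count for i in range(layer_count)]


max_dropout = 0.6
num_groups = 4

# old style: compute the shape on a 0-1000 scale, then rescale to [0, max_dropout]
old = [v / 1000 * max_dropout for v in funnel_shape(1000, num_groups + 1)][:-1]

# new style: ask for the shape already scaled to max_dropout
new = funnel_shape(max_dropout, num_groups + 1)[:-1]

assert all(abs(a - b) < 1e-9 for a, b in zip(old, new))
print(new)  # e.g. [0.6, 0.48, 0.36, 0.24]
```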
10 changes: 5 additions & 5 deletions autoPyTorch/pipeline/components/setup/network_backbone/utils.py
@@ -94,7 +94,7 @@ def backward(ctx: typing.Any,

def shake_get_alpha_beta(is_training: bool, is_cuda: bool
) -> typing.Tuple[torch.tensor, torch.tensor]:
if is_training:
if not is_training:
result = (torch.FloatTensor([0.5]), torch.FloatTensor([0.5]))
return result if not is_cuda else (result[0].cuda(), result[1].cuda())

@@ -118,11 +118,11 @@ def shake_drop_get_bl(
) -> torch.tensor:
pl = 1 - ((block_index + 1) / num_blocks) * (1 - min_prob_no_shake)

if not is_training:
# Move to torch.randn(1) for reproducibility
bl = torch.tensor(1.0) if torch.randn(1) <= pl else torch.tensor(0.0)
if is_training:
bl = torch.tensor(pl)
# Move to torch.rand(1) for reproducibility
bl = torch.as_tensor(1.0) if torch.rand(1) <= pl else torch.as_tensor(0.0)
else:
bl = torch.as_tensor(pl)

if is_cuda:
bl = bl.cuda()
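Two behavioural fixes land in this file: `shake_get_alpha_beta` now returns the deterministic (0.5, 0.5) pair when *not* training, and `shake_drop_get_bl` samples its gate with `torch.rand` during training while using the expectation `pl` at evaluation time. A standalone sketch of the corrected shake-drop gate (the `_sketch` name is illustrative, not the library API):

```python
import torch


def shake_drop_get_bl_sketch(block_index: int,
                             min_prob_no_shake: float,
                             num_blocks: int,
                             is_training: bool) -> torch.Tensor:
    # keep-probability decreases linearly with block depth
    pl = 1 - ((block_index + 1) / num_blocks) * (1 - min_prob_no_shake)

    if is_training:
        # stochastic gate: 1.0 with probability pl, else 0.0
        bl = torch.as_tensor(1.0) if torch.rand(1) <= pl else torch.as_tensor(0.0)
    else:
        # evaluation: use the expected value of the gate
        bl = torch.as_tensor(pl)
    return bl


torch.manual_seed(0)
print(shake_drop_get_bl_sketch(0, 0.5, 4, is_training=True))   # tensor(1.) or tensor(0.)
print(shake_drop_get_bl_sketch(0, 0.5, 4, is_training=False))  # tensor(0.8750)
```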
@@ -59,23 +59,36 @@ def get_hyperparameter_search_space(
) -> ConfigurationSpace:
cs = ConfigurationSpace()

min_num_layers, max_num_layers = num_layers.value_range
num_layers_hp = get_hyperparameter(num_layers, UniformIntegerHyperparameter)
min_num_layers: int = num_layers.value_range[0] # type: ignore
max_num_layers: int = num_layers.value_range[-1] # type: ignore
num_layers_is_constant = (min_num_layers == max_num_layers)

num_layers_hp = get_hyperparameter(num_layers, UniformIntegerHyperparameter)
activation_hp = get_hyperparameter(activation, CategoricalHyperparameter)
cs.add_hyperparameter(num_layers_hp)

cs.add_hyperparameters([num_layers_hp, activation_hp])
cs.add_condition(CS.GreaterThanCondition(activation_hp, num_layers_hp, 1))
if not num_layers_is_constant:
cs.add_hyperparameter(activation_hp)
cs.add_condition(CS.GreaterThanCondition(activation_hp, num_layers_hp, 1))
elif max_num_layers > 1:
# only add activation if we have more than 1 layer
cs.add_hyperparameter(activation_hp)

for i in range(1, int(max_num_layers)):
num_units_search_space = HyperparameterSearchSpace(hyperparameter=f"units_layer_{i}",
value_range=units_layer.value_range,
default_value=units_layer.default_value,
log=units_layer.log)
for i in range(1, max_num_layers + 1):
num_units_search_space = HyperparameterSearchSpace(
hyperparameter=f"units_layer_{i}",
value_range=units_layer.value_range,
default_value=units_layer.default_value,
log=units_layer.log,
)
num_units_hp = get_hyperparameter(num_units_search_space, UniformIntegerHyperparameter)
cs.add_hyperparameter(num_units_hp)

if i >= int(min_num_layers):
if i >= min_num_layers and not num_layers_is_constant:
# In the case of a constant, the max and min number of layers are the same.
# So no condition is needed. If it is not a constant but a hyperparameter,
# then a condition has to be made so that it accounts for the value of the
# hyperparameter.
cs.add_condition(CS.GreaterThanCondition(num_units_hp, num_layers_hp, i))

return cs
@@ -24,7 +24,7 @@ def initialization(m: torch.nn.Module) -> None:
torch.nn.Conv2d,
torch.nn.Conv3d,
torch.nn.Linear)):
torch.nn.init.xavier_uniform_(m.weight.data)
torch.nn.init.xavier_normal(m.weight.data)
if m.bias is not None and self.bias_strategy == 'Zero':
torch.nn.init.constant_(m.bias.data, 0.0)
return initialization
@@ -37,6 +37,7 @@ def __init__(self, batch_size: int = 64,
self.batch_size = batch_size
self.train_data_loader = None # type: Optional[torch.utils.data.DataLoader]
self.val_data_loader = None # type: Optional[torch.utils.data.DataLoader]
self.test_data_loader: Optional[torch.utils.data.DataLoader] = None

# We also support existing datasets!
self.dataset = None
@@ -69,7 +70,8 @@ def transform(self, X: np.ndarray) -> np.ndarray:
np.ndarray: Transformed features
"""
X.update({'train_data_loader': self.train_data_loader,
'val_data_loader': self.val_data_loader})
'val_data_loader': self.val_data_loader,
'test_data_loader': self.test_data_loader})
return X

def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader:
@@ -112,7 +114,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader:
shuffle=True,
num_workers=X.get('num_workers', 0),
pin_memory=X.get('pin_memory', True),
drop_last=X.get('drop_last', True),
drop_last=X.get('drop_last', False),
collate_fn=custom_collate_fn,
)

@@ -126,6 +128,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader:
collate_fn=custom_collate_fn,
)

if X.get('X_test', None) is not None:
self.test_data_loader = self.get_loader(X=X['X_test'],
y=X['y_test'],
batch_size=self.batch_size)

return self

def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: int = np.inf,
@@ -137,6 +144,7 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size:

dataset = BaseDataset(
train_tensors=(X, y),
seed=self.random_state.get_state()[1][0],
# This dataset is used for loading test data in a batched format
train_transforms=self.test_transform,
val_transforms=self.test_transform,
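The data loader component now also builds a `test_data_loader` whenever `X_test` is present in the fit dictionary, and the train loader no longer drops the last incomplete batch by default. A plain-torch sketch of the resulting loader setup (`build_loaders` is illustrative, not the autoPyTorch component):

```python
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset


def build_loaders(X_train, y_train, X_test=None, y_test=None, batch_size=64):
    train_loader = DataLoader(
        TensorDataset(torch.as_tensor(X_train, dtype=torch.float32),
                      torch.as_tensor(y_train)),
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,   # matches the new default in this hunk
    )
    test_loader = None
    if X_test is not None:
        # only built when test data is available, mirroring the X.get('X_test') check
        test_loader = DataLoader(
            TensorDataset(torch.as_tensor(X_test, dtype=torch.float32),
                          torch.as_tensor(y_test)),
            batch_size=batch_size,
            shuffle=False,  # evaluation data is not shuffled
        )
    return train_loader, test_loader


train_loader, test_loader = build_loaders(
    np.random.rand(100, 5), np.random.randint(0, 2, 100),
    X_test=np.random.rand(20, 5), y_test=np.random.randint(0, 2, 20),
)
print(len(train_loader), len(test_loader))  # 2 train batches, 1 test batch at batch_size=64
```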
15 changes: 2 additions & 13 deletions autoPyTorch/pipeline/tabular_classification.py
@@ -7,7 +7,6 @@

import numpy as np

import sklearn.preprocessing
from sklearn.base import ClassifierMixin

import torch
@@ -101,13 +100,8 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray:
loader = self.named_steps['data_loader'].get_loader(X=X)
pred = self.named_steps['network'].predict(loader)
if isinstance(self.dataset_properties['output_shape'], int):
proba = pred[:, :self.dataset_properties['output_shape']]
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer

return proba

# The final layer is always softmax now (`pred` already gives pseudo proba)
return pred
else:
raise ValueError("Expected output_shape to be integer, got {},"
"Tabular Classification only supports 'binary' and 'multiclass' outputs"
@@ -149,11 +143,6 @@ def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray:
pred_prob = self.predict_proba(X[batch_from:batch_to], batch_size=None)
y[batch_from:batch_to] = pred_prob.astype(np.float32)

# Neural networks might not be fit to produce a [0-1] output
# For instance, after small number of epochs.
y = np.clip(y, 0, 1)
y = sklearn.preprocessing.normalize(y, axis=1, norm='l1')

return y

def score(self, X: np.ndarray, y: np.ndarray,
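The clipping and L1 re-normalization in `predict_proba` could be dropped because, as the new comment notes, the network's final layer is now always a softmax, so each predicted row is already a valid probability distribution. A short sketch of that invariant:

```python
import torch

logits = torch.randn(4, 3)            # raw network outputs for 4 samples, 3 classes
proba = torch.softmax(logits, dim=1)  # what a softmax output layer produces

print(proba.sum(dim=1))               # tensor([1., 1., 1., 1.]) up to float error
assert torch.allclose(proba.sum(dim=1), torch.ones(4))
```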
10 changes: 5 additions & 5 deletions test/test_pipeline/components/setup/test_setup.py
@@ -483,12 +483,12 @@ def test_dropout(self, resnet_shape):
backbone = resnet_backbone.build_backbone((100, 5))
dropout_probabilites = [resnet_backbone.config[key] for key in resnet_backbone.config if 'dropout_' in key]
dropout_shape = get_shaped_neuron_counts(
resnet_shape, 0, 0, 1000, num_groups + 1
shape=resnet_shape,
in_feat=0,
out_feat=0,
max_neurons=max_dropout,
layer_count=num_groups + 1,
)[:-1]

dropout_shape = [
dropout / 1000 * max_dropout for dropout in dropout_shape
]
blocks_dropout = []
for block in backbone:
if isinstance(block, torch.nn.Sequential):