Refactoring base dataset #105

Merged

Changes from all commits (24 commits)
c9d096a
Make sure the performance of pipeline is at least 0.8
franchuterivera Feb 2, 2021
085c1d5
Early stop fixes
franchuterivera Feb 2, 2021
8325ea9
Network Cleanup (#81)
bastiscode Feb 2, 2021
822b339
First documentation
franchuterivera Feb 2, 2021
162d6b8
Default to ubuntu-18.04
franchuterivera Feb 2, 2021
ea20d7e
Comment enhancements
franchuterivera Feb 2, 2021
364222a
Feature preprocessors, Loss strategies (#86)
ravinkohli Feb 9, 2021
f02c2ab
Validate the input to autopytorch
franchuterivera Feb 9, 2021
92c2cdb
Bug fixes after rebase
franchuterivera Feb 9, 2021
0a666f2
Move to new scikit learn
franchuterivera Feb 10, 2021
014a01f
Remove dangerous convert dtype
franchuterivera Feb 10, 2021
42b12f4
Try to remove random float error again and make data picklable
franchuterivera Feb 10, 2021
7d2272c
Test pickle on versions higher than 3.6
franchuterivera Feb 10, 2021
e28e7d7
Test pickle on versions higher than 3.6
franchuterivera Feb 10, 2021
00c51d9
Comment fixes
franchuterivera Feb 10, 2021
6d9da10
[REFACTORING]: no change in the functionalities, inputs, returns
nabenabe0928 Feb 19, 2021
bea1d3e
Modified an error message
nabenabe0928 Feb 19, 2021
a0e8a80
[Test error fix]: Fixed the error caused by flake8
nabenabe0928 Feb 22, 2021
782eaa0
[Test error fix]: Fixed the error caused by flake8
nabenabe0928 Feb 22, 2021
00aaaef
merged to refactor dev
nabenabe0928 Feb 22, 2021
eb95578
[PR response]: deleted unneeded changes from merge and fixed the doc-…
nabenabe0928 Feb 23, 2021
8398188
fixed the for loop in type_check based on samuel's review
nabenabe0928 Feb 26, 2021
edbbb29
deleted blank space pointed out by flake8
nabenabe0928 Feb 26, 2021
eac426d
modified the doc-string in TransformSubset in base_dataset.py
nabenabe0928 Mar 9, 2021
97 changes: 43 additions & 54 deletions autoPyTorch/datasets/base_dataset.py
@@ -24,16 +24,17 @@
)
from autoPyTorch.utils.common import FitRequirement, hash_array_or_matrix

BASE_DATASET_INPUT = Union[Tuple[np.ndarray, np.ndarray], Dataset]
BaseDatasetType = Union[Tuple[np.ndarray, np.ndarray], Dataset]


def check_valid_data(data: Any) -> None:
if not (hasattr(data, '__getitem__') and hasattr(data, '__len__')):
if not all(hasattr(data, attr) for attr in ['__getitem__', '__len__']):
raise ValueError(
'The specified Data for Dataset does either not have a __getitem__ or a __len__ attribute.')
'The specified Data for Dataset must have both __getitem__ and __len__ attribute.')


def type_check(train_tensors: BASE_DATASET_INPUT, val_tensors: Optional[BASE_DATASET_INPUT] = None) -> None:
def type_check(train_tensors: BaseDatasetType, val_tensors: Optional[BaseDatasetType] = None) -> None:
"""To avoid unexpected behavior, we use loops over indices."""
for i in range(len(train_tensors)):
check_valid_data(train_tensors[i])
if val_tensors is not None:
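A minimal sketch of what the new check_valid_data accepts and rejects, assuming it is called directly on raw data containers; the example inputs are illustrative:

    import numpy as np

    check_valid_data(np.zeros((5, 3)))        # passes: ndarrays expose __getitem__ and __len__
    check_valid_data([(0, 1), (2, 3)])        # passes: plain lists do as well
    check_valid_data(x for x in range(5))     # raises ValueError: a generator has neither attribute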
@@ -42,12 +43,20 @@ def type_check(train_tensors: BASE_DATASET_INPUT, val_tensors: Optional[BASE_DAT


class TransformSubset(Subset):
"""
Because the BaseDataset contains all the data (train/val/test), the transformations
have to be applied with some directions. That is, if yielding train data,
we expect to apply train transformation (which have augmentations exclusively).
"""Wrapper of BaseDataset for splitted datasets

Since the BaseDataset contains all the data points (train/val/test),
we require different transformation for each data point.
This class helps to take the subset of the dataset
with either training or validation transformation.

We achieve so by adding a train flag to the pytorch subset

Attributes:
dataset (BaseDataset/Dataset): Dataset to sample the subset
indices names (Sequence[int]): Indices to sample from the dataset
train (bool): If we apply train or validation transformation

"""

def __init__(self, dataset: Dataset, indices: Sequence[int], train: bool) -> None:
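A rough usage sketch of the train flag described in the docstring above: it decides which transform the parent dataset applies when the subset is indexed. Here `dataset` stands for any concrete BaseDataset and the split id and variable names are illustrative; the forwarding of the flag to BaseDataset.__getitem__ is assumed from get_dataset_for_training further down.

    train_indices, val_indices = dataset.splits[0]                       # first resampling split
    train_subset = TransformSubset(dataset, train_indices, train=True)   # yields train-transformed points
    val_subset = TransformSubset(dataset, val_indices, train=False)      # yields val-transformed points
    x, y = train_subset[0]                                               # presumably forwards train=True to BaseDataset.__getitem__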
@@ -62,10 +71,10 @@ def __getitem__(self, idx: int) -> np.ndarray:
class BaseDataset(Dataset, metaclass=ABCMeta):
def __init__(
self,
train_tensors: BASE_DATASET_INPUT,
train_tensors: BaseDatasetType,
dataset_name: Optional[str] = None,
val_tensors: Optional[BASE_DATASET_INPUT] = None,
test_tensors: Optional[BASE_DATASET_INPUT] = None,
val_tensors: Optional[BaseDatasetType] = None,
test_tensors: Optional[BaseDatasetType] = None,
resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
shuffle: Optional[bool] = True,
@@ -97,18 +106,15 @@ def __init__(
val_transforms (Optional[torchvision.transforms.Compose]):
Additional Transforms to be applied to the validation/test data
"""
if dataset_name is not None:
self.dataset_name = dataset_name
else:
self.dataset_name = hash_array_or_matrix(train_tensors[0])
self.dataset_name = dataset_name if dataset_name is not None \
else hash_array_or_matrix(train_tensors[0])

if not hasattr(train_tensors[0], 'shape'):
type_check(train_tensors, val_tensors)
self.train_tensors = train_tensors
self.val_tensors = val_tensors
self.test_tensors = test_tensors
self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors
self.cross_validators: Dict[str, CROSS_VAL_FN] = {}
self.holdout_validators: Dict[str, HOLDOUT_FN] = {}
self.rand = np.random.RandomState(seed=seed)
self.rng = np.random.RandomState(seed=seed)
self.shuffle = shuffle
self.resampling_strategy = resampling_strategy
self.resampling_strategy_args = resampling_strategy_args
@@ -128,16 +134,8 @@ def __init__(
self.is_small_preprocess = True

# Make sure cross validation splits are created once
self.cross_validators = get_cross_validators(
CrossValTypes.stratified_k_fold_cross_validation,
CrossValTypes.k_fold_cross_validation,
CrossValTypes.shuffle_split_cross_validation,
CrossValTypes.stratified_shuffle_split_cross_validation
)
self.holdout_validators = get_holdout_validators(
HoldoutValTypes.holdout_validation,
HoldoutValTypes.stratified_holdout_validation
)
self.cross_validators = get_cross_validators(*CrossValTypes)
self.holdout_validators = get_holdout_validators(*HoldoutValTypes)
self.splits = self.get_splits_from_resampling_strategy()

# We also need to be able to transform the data, be it for pre-processing
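The get_cross_validators(*CrossValTypes) call above relies on the fact that iterating over an Enum class yields all of its members, so every defined strategy gets registered without listing each one. A self-contained sketch of the idiom; the names below are illustrative stand-ins, not autoPyTorch code:

    from enum import Enum

    class Validators(Enum):               # illustrative stand-in for CrossValTypes
        k_fold = 1
        shuffle_split = 2

    def get_validators(*validators):      # illustrative stand-in for get_cross_validators
        return {v.name: v for v in validators}

    get_validators(*Validators)           # {'k_fold': Validators.k_fold, 'shuffle_split': Validators.shuffle_split}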
@@ -146,19 +144,19 @@
self.val_transform = val_transforms

def update_transform(self, transform: Optional[torchvision.transforms.Compose],
train: bool = True,
) -> 'BaseDataset':
train: bool = True) -> 'BaseDataset':
"""
During the pipeline execution, the pipeline object might propose transformations
as a product of the current pipeline configuration being tested.

This utility allows to return a self with the updated transformation, so that
This utility allows to return self with the updated transformation, so that
a dataloader can yield this dataset with the desired transformations

Args:
transform (torchvision.transforms.Compose): The transformations proposed
by the current pipeline
train (bool): Whether to update the train or validation transform
transform (torchvision.transforms.Compose):
The transformations proposed by the current pipeline
train (bool):
Whether to update the train or validation transform

Returns:
self: A copy of the update pipeline
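A hedged usage sketch of update_transform above: a pipeline step hands the dataset the transforms it wants applied, then builds loaders on the returned dataset. The torchvision transforms chosen here are purely illustrative.

    import torchvision.transforms as T

    dataset = dataset.update_transform(T.Compose([T.RandomHorizontalFlip(), T.ToTensor()]), train=True)
    dataset = dataset.update_transform(T.Compose([T.ToTensor()]), train=False)
    # a DataLoader built on this dataset now yields points with the proposed transforms applied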
@@ -171,9 +169,9 @@ def update_transform(self, transform: Optional[torchvision.transforms.Compose],

def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]:
"""
The base dataset uses a Subset of the data. Nevertheless, the base dataset expect
both validation and test data to be present in the same dataset, which motivated the
need to dynamically give train/test data with the __getitem__ command.
The base dataset uses a Subset of the data. Nevertheless, the base dataset expects
both validation and test data to be present in the same dataset, which motivates
the need to dynamically give train/test data with the __getitem__ command.

This method yields a datapoint of the whole data (after a Subset has selected a given
item, based on the resampling strategy) and applies a train/testing transformation, if any.
@@ -186,34 +184,24 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]:
A transformed single point prediction
"""

if hasattr(self.train_tensors[0], 'loc'):
X = self.train_tensors[0].iloc[[index]]
else:
X = self.train_tensors[0][index]
X = self.train_tensors[0].iloc[[index]] if hasattr(self.train_tensors[0], 'loc') \
else self.train_tensors[0][index]

if self.train_transform is not None and train:
X = self.train_transform(X)
elif self.val_transform is not None and not train:
X = self.val_transform(X)

# In case of prediction, the targets are not provided
Y = self.train_tensors[1]
if Y is not None:
Y = Y[index]
else:
Y = None
Y = self.train_tensors[1][index] if self.train_tensors[1] is not None else None

return X, Y

def __len__(self) -> int:
return self.train_tensors[0].shape[0]

def _get_indices(self) -> np.ndarray:
if self.shuffle:
indices = self.rand.permutation(len(self))
else:
indices = np.arange(len(self))
return indices
return self.rng.permutation(len(self)) if self.shuffle else np.arange(len(self))

def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]:
"""
@@ -333,7 +321,7 @@ def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]:
return (TransformSubset(self, self.splits[split_id][0], train=True),
TransformSubset(self, self.splits[split_id][1], train=False))

def replace_data(self, X_train: BASE_DATASET_INPUT, X_test: Optional[BASE_DATASET_INPUT]) -> 'BaseDataset':
def replace_data(self, X_train: BaseDatasetType, X_test: Optional[BaseDatasetType]) -> 'BaseDataset':
"""
To speed up the training of small dataset, early pre-processing of the data
can be made on the fly by the pipeline.
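A usage sketch under assumptions (the method body is collapsed in this view): the pipeline passes already pre-processed feature data and keeps working with the returned dataset. `my_preprocessor` is hypothetical, and it is assumed that replace_data expects only the feature part of the tensors.

    # assumption: replace_data swaps in the pre-processed features and returns the dataset itself
    X_train_proc = my_preprocessor.fit_transform(dataset.train_tensors[0])
    X_test_proc = my_preprocessor.transform(dataset.test_tensors[0]) if dataset.test_tensors is not None else None
    dataset = dataset.replace_data(X_train_proc, X_test_proc)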
@@ -361,7 +349,8 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) ->
contain.

Returns:

dataset_properties (Dict[str, Any]):
Dict of the dataset properties.
"""
dataset_properties = dict()
for dataset_requirement in dataset_requirements:
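The loop in get_dataset_properties above presumably reads each requirement off the dataset by name; a one-line sketch under that assumption (the actual loop body is collapsed in this view):

    # assumption: each FitRequirement's name corresponds to an attribute of the dataset
    dataset_properties = {req.name: getattr(self, req.name) for req in dataset_requirements}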
1 change: 0 additions & 1 deletion test/conftest.py
@@ -236,7 +236,6 @@ def get_fit_dictionary(X, y, validator, backend):
info = datamanager.get_required_dataset_info()

dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info))

fit_dictionary = {
'X_train': datamanager.train_tensors[0],
'y_train': datamanager.train_tensors[1],