diff --git a/aeon/classification/distance_based/__init__.py b/aeon/classification/distance_based/__init__.py
index 7c1c41bdf1..a2dfc26433 100644
--- a/aeon/classification/distance_based/__init__.py
+++ b/aeon/classification/distance_based/__init__.py
@@ -1,8 +1,14 @@
 """Distance based time series classifiers."""
 
-__all__ = ["ElasticEnsemble", "KNeighborsTimeSeriesClassifier", "ProximityTree"]
+__all__ = [
+    "ElasticEnsemble",
+    "KNeighborsTimeSeriesClassifier",
+    "ProximityTree",
+    "ProximityForest",
+]
 
 from aeon.classification.distance_based._elastic_ensemble import ElasticEnsemble
+from aeon.classification.distance_based._proximity_forest import ProximityForest
 from aeon.classification.distance_based._proximity_tree import ProximityTree
 from aeon.classification.distance_based._time_series_neighbors import (
     KNeighborsTimeSeriesClassifier,
diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py
new file mode 100644
index 0000000000..c88d2b7823
--- /dev/null
+++ b/aeon/classification/distance_based/_proximity_forest.py
@@ -0,0 +1,162 @@
+"""Proximity Forest Classifier.
+
+The Proximity Forest is an ensemble of Proximity Trees.
+"""
+
+__all__ = ["ProximityForest"]
+
+from typing import Optional, Union
+
+import numpy as np
+from joblib import Parallel, delayed
+from sklearn.utils import check_random_state
+
+from aeon.classification.base import BaseClassifier
+from aeon.classification.distance_based._proximity_tree import ProximityTree
+
+
+class ProximityForest(BaseClassifier):
+    """Proximity Forest Classifier.
+
+    The Proximity Forest is a distance-based classifier that creates an
+    ensemble of decision trees, where the splits are based on the
+    similarity between time series measured using various parameterised
+    distance measures.
+
+    Parameters
+    ----------
+    n_trees: int, default = 100
+        The number of trees, by default an ensemble of 100 trees is formed.
+    n_splitters: int, default = 5
+        The number of candidate splitters to be evaluated at each node.
+    max_depth: int, default = None
+        The maximum depth of the tree. If None, then nodes are expanded until all
+        leaves are pure or until all leaves contain less than min_samples_split samples.
+    min_samples_split: int, default = 2
+        The minimum number of samples required to split an internal node.
+    random_state : int, RandomState instance or None, default=None
+        If `int`, random_state is the seed used by the random number generator;
+        If `RandomState` instance, random_state is the random number generator;
+        If `None`, the random number generator is the `RandomState` instance used
+        by `np.random`. Every tree is given its own seed drawn from this generator.
+    n_jobs : int, default = 1
+        The number of jobs used to fit the trees and to predict in parallel.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+    parallel_backend : str, ParallelBackendBase instance or None, default=None
+        Specify the parallelisation backend implementation in joblib, if None a 'prefer'
+        value of "threads" is used by default.
+        Valid options are "loky", "multiprocessing", "threading" or a custom backend.
+        See the joblib Parallel documentation for more details.
+
+    Notes
+    -----
+    For the Java version, see
+    `ProximityForest
+    <https://github.com/fpetitjean/ProximityForest>`_.
+
+    References
+    ----------
+    .. [1] Lucas, B., Shifaz, A., Pelletier, C., O’Neill, L., Zaidi, N., Goethals, B.,
+    Petitjean, F. and Webb, G.I., 2019. Proximity forest: an effective and scalable
+    distance-based classifier for time series. Data Mining and Knowledge Discovery,
+    33(3), pp.607-635.
+
+    Examples
+    --------
+    >>> from aeon.datasets import load_unit_test
+    >>> from aeon.classification.distance_based import ProximityForest
+    >>> X_train, y_train = load_unit_test(split="train")
+    >>> X_test, y_test = load_unit_test(split="test")
+    >>> classifier = ProximityForest(n_trees = 10, n_splitters = 3)
+    >>> classifier.fit(X_train, y_train)
+    ProximityForest(...)
+    >>> y_pred = classifier.predict(X_test)
+    """
+
+    _tags = {
+        "capability:multivariate": False,
+        "capability:unequal_length": False,
+        "capability:multithreading": True,
+        "algorithm_type": "distance",
+        "X_inner_type": ["numpy2D"],
+    }
+
+    def __init__(
+        self,
+        n_trees: int = 100,
+        n_splitters: int = 5,
+        max_depth: Optional[int] = None,
+        min_samples_split: int = 2,
+        random_state: Union[int, np.random.RandomState, None] = None,
+        n_jobs: int = 1,
+        parallel_backend=None,
+    ):
+        self.n_trees = n_trees
+        self.n_splitters = n_splitters
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+        self.parallel_backend = parallel_backend
+        super().__init__()
+
+    def _fit(self, X, y):
+        if self.n_trees < 1:
+            raise ValueError(
+                f"n_trees must be a positive integer, but found {self.n_trees}."
+            )
+        self.classes_ = list(np.unique(y))
+        # Give each tree its own seed. Passing ``self.random_state`` on unchanged
+        # would seed every tree identically, so a fixed seed would grow the same
+        # tree n_trees times instead of a diverse ensemble.
+        rng = check_random_state(self.random_state)
+        seeds = rng.randint(np.iinfo(np.int32).max, size=self.n_trees)
+        self.trees_ = Parallel(
+            n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads"
+        )(
+            delayed(_fit_tree)(
+                X,
+                y,
+                self.n_splitters,
+                self.max_depth,
+                self.min_samples_split,
+                int(seed),
+                self.n_jobs,
+            )
+            for seed in seeds
+        )
+        return self
+
+    def _predict_proba(self, X):
+        tree_probas = Parallel(
+            n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads"
+        )(delayed(_predict_proba_tree)(tree, X) for tree in self.trees_)
+        # Average over the trees that were actually fitted, so the probabilities
+        # stay normalised even if ``n_trees`` is changed after fitting.
+        return np.mean(tree_probas, axis=0)
+
+    def _predict(self, X):
+        probas = self._predict_proba(X)
+        idx = np.argmax(probas, axis=1)
+        preds = np.asarray([self.classes_[x] for x in idx])
+        return preds
+
+
+def _fit_tree(X, y, n_splitters, max_depth, min_samples_split, random_state, n_jobs):
+    """Build a ProximityTree with the given parameters, fit it and return it."""
+    clf = ProximityTree(
+        n_splitters=n_splitters,
+        max_depth=max_depth,
+        min_samples_split=min_samples_split,
+        random_state=random_state,
+        n_jobs=n_jobs,
+    )
+    clf.fit(X, y)
+    return clf
+
+
+def _predict_proba_tree(tree, X):
+    """Return the class probabilities predicted by one fitted tree."""
+    return tree.predict_proba(X)
diff --git a/aeon/classification/distance_based/_proximity_tree.py b/aeon/classification/distance_based/_proximity_tree.py
index 3a32e4d077..1276cdf341 100644
--- a/aeon/classification/distance_based/_proximity_tree.py
+++ b/aeon/classification/distance_based/_proximity_tree.py
@@ -117,7 +117,7 @@ class ProximityTree(BaseClassifier):
         "capability:multivariate": False,
         "capability:unequal_length": False,
         "algorithm_type": "distance",
-        "X_inner_type": ["numpy2D", "numpy3D"],
+        "X_inner_type": ["numpy2D"],
     }
 
     def __init__(
@@ -371,12 +371,6 @@ def _find_target_value(y):
         return mode_value
 
     def _fit(self, X, y):
-        # Check dimension of X
-        if X.ndim == 3:
-            if X.shape[1] == 1:
-                X = np.squeeze(X, axis=1)
-            else:
-                raise ValueError("X should be univariate.")
         # Set the unique class labels
         self.classes_ = list(np.unique(y))
 
@@ -391,12 +385,6 @@ def _predict(self, X):
         return np.array([self.classes_[pred] for pred in predictions])
 
     def _predict_proba(self, X):
-        # Check dimension of X
-        if X.ndim == 3:
-            if X.shape[1] == 1:
-                X = np.squeeze(X, axis=1)
-            else:
-                raise ValueError("X should be univariate.")
         # Get the unique class labels
         classes = self.classes_
         class_count = len(classes)
diff --git a/aeon/classification/distance_based/tests/test_proximity_forest.py b/aeon/classification/distance_based/tests/test_proximity_forest.py
new file mode 100644
index 0000000000..9575177327
--- /dev/null
+++ b/aeon/classification/distance_based/tests/test_proximity_forest.py
@@ -0,0 +1,27 @@
+"""Test for Proximity Forest."""
+
+import pytest
+from sklearn.metrics import accuracy_score
+
+from aeon.classification.distance_based import ProximityForest
+from aeon.datasets import load_unit_test
+
+
+def test_univariate():
+    """Test that fitting on multivariate input raises a ValueError."""
+    X, y = load_unit_test()
+    X_multivariate = X.reshape((-1, 2, 12))
+    clf = ProximityForest(n_trees=5, random_state=42, n_jobs=-1)
+    with pytest.raises(ValueError):
+        clf.fit(X_multivariate, y)
+
+
+def test_proximity_forest():
+    """Test the fit and predict methods of ProximityForest."""
+    X_train, y_train = load_unit_test()
+    X_test, y_test = load_unit_test(split="test")
+    clf = ProximityForest(n_trees=5, n_splitters=3, max_depth=4)
+    clf.fit(X_train, y_train)
+    y_pred = clf.predict(X_test)
+    score = accuracy_score(y_test, y_pred)
+    assert score >= 0.9
diff --git a/aeon/classification/distance_based/tests/test_proximity_tree.py b/aeon/classification/distance_based/tests/test_proximity_tree.py
index 42510f0cd6..4b86f82840 100644
--- a/aeon/classification/distance_based/tests/test_proximity_tree.py
+++ b/aeon/classification/distance_based/tests/test_proximity_tree.py
@@ -6,17 +6,7 @@
 
 from aeon.classification.distance_based import ProximityTree
 from aeon.classification.distance_based._proximity_tree import gini, gini_gain
-from aeon.testing.data_generation import make_example_3d_numpy
-
-
-@pytest.fixture
-def time_series_dataset():
-    """Generate time series dataset for testing."""
-    n_samples = 100  # Total number of samples (should be even)
-    n_timepoints = 24  # Length of each time series
-    n_channels = 1
-    data, labels = make_example_3d_numpy(n_samples, n_channels, n_timepoints)
-    return data, labels
+from aeon.datasets import load_unit_test
 
 
 def test_gini():
@@ -110,9 +100,9 @@ def test_get_parameter_value():
             assert measure_params["c"] in [10**i for i in range(-2, 3)]
 
 
-def test_get_cadidate_splitter(time_series_dataset):
+def test_get_cadidate_splitter():
     """Test the method to generate candidate splitters."""
-    X, y = time_series_dataset
+    X, y = load_unit_test()
     clf = ProximityTree()
     splitter = clf._get_candidate_splitter(X, y)
     assert len(splitter) == 2
@@ -132,9 +122,9 @@ def test_get_cadidate_splitter(time_series_dataset):
     assert measure in expected_measures
 
 
-def test_get_best_splitter(time_series_dataset):
+def test_get_best_splitter():
     """Test the method to get optimum splitter of a node."""
-    X, y = time_series_dataset
+    X, y = load_unit_test()
     clf = ProximityTree(n_splitters=3)
     splitter = clf._get_best_splitter(X, y)
 
@@ -146,12 +136,12 @@ def test_get_best_splitter(time_series_dataset):
     assert len(splitter) == 2
 
 
-def test_proximity_tree(time_series_dataset):
+def test_proximity_tree():
     """Test the fit method of ProximityTree."""
-    X, y = time_series_dataset
+    X, y = load_unit_test()
     clf = ProximityTree(n_splitters=3, max_depth=4)
     clf.fit(X, y)
-    X_test, y_test = time_series_dataset
+    X_test, y_test = load_unit_test(split="train")
     y_pred = clf.predict(X_test)
     score = accuracy_score(y_test, y_pred)
     assert score >= 0.9
diff --git a/docs/api_reference/classification.rst b/docs/api_reference/classification.rst
index ab36e9a19b..107e00d423 100644
--- a/docs/api_reference/classification.rst
+++ b/docs/api_reference/classification.rst
@@ -73,6 +73,7 @@ Distance-based
 
     ElasticEnsemble
     KNeighborsTimeSeriesClassifier
+    ProximityForest
     ProximityTree
 
 Feature-based