From dc8bd1d3d2f4895b096a443676b9192aba65277a Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Wed, 26 Jun 2024 14:52:37 +0530 Subject: [PATCH 01/14] Proximity Forest draft --- .../distance_based/_proximity_forest.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 aeon/classification/distance_based/_proximity_forest.py diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py new file mode 100644 index 0000000000..0df039d447 --- /dev/null +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -0,0 +1,81 @@ +"""Proximity Forest Classifier. + +The Proximity Forest is an ensemble of Proximity Trees. +""" + +from typing import Type, Union + +import numpy as np + +from aeon.classification.base import BaseClassifier +from aeon.classification.distance_based import ProximityTree + + +class ProximityForest(BaseClassifier): + """Proximity Forest Classifier. + + The Proximity Forest is an ensemble of Proximity Trees. + """ + + def __init__( + self, + n_trees=10, + n_splitters: int = 5, + max_depth: int = None, + min_samples_split: int = 2, + random_state: Union[int, Type[np.random.RandomState], None] = None, + n_jobs: int = 1, + ): + self.n_trees = n_trees + self.n_splitters = n_splitters + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.random_state = random_state + self.n_jobs = n_jobs + super().__init__() + + def _fit(self, X, y): + # Check dimension of X + if X.ndim == 3: + if X.shape[1] == 1: + X = np.squeeze(X, axis=1) + else: + raise ValueError("X should be univariate.") + + self.classes_ = list(np.unique(y)) + self.trees_ = [] + for _ in range(self.n_trees): + clf = ProximityTree( + n_splitters=self.n_splitters, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + random_state=self.random_state, + n_jobs=self.n_jobs, + ) + clf.fit(X, y) + self.trees_.append(clf) + + self._is_fitted = True + + def _predict_proba(self, X): + # Check dimension of X + if X.ndim == 3: + if X.shape[1] == 1: + X = np.squeeze(X, axis=1) + else: + raise ValueError("X should be univariate.") + + output_probas = [] + for i in range(self.n_trees): + proba = self.trees_[i].predict_proba(X) + output_probas.append(proba) + + output_probas = np.sum(output_probas, axis=0) + output_probas = np.divide(output_probas, self.n_trees) + return output_probas + + def _predict(self, X): + probas = self._predict_proba(X) + idx = np.argmax(probas, axis=1) + preds = np.asarray([self.classes_[x] for x in idx]) + return preds From 7406dff3f21a6fcb957f65f0b4d9b573406ad916 Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Fri, 28 Jun 2024 15:57:19 +0530 Subject: [PATCH 02/14] Update init --- aeon/classification/distance_based/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/aeon/classification/distance_based/__init__.py b/aeon/classification/distance_based/__init__.py index 7c1c41bdf1..a2dfc26433 100644 --- a/aeon/classification/distance_based/__init__.py +++ b/aeon/classification/distance_based/__init__.py @@ -1,8 +1,14 @@ """Distance based time series classifiers.""" -__all__ = ["ElasticEnsemble", "KNeighborsTimeSeriesClassifier", "ProximityTree"] +__all__ = [ + "ElasticEnsemble", + "KNeighborsTimeSeriesClassifier", + "ProximityTree", + "ProximityForest", +] from aeon.classification.distance_based._elastic_ensemble import ElasticEnsemble +from aeon.classification.distance_based._proximity_forest import ProximityForest from aeon.classification.distance_based._proximity_tree import ProximityTree from aeon.classification.distance_based._time_series_neighbors import ( KNeighborsTimeSeriesClassifier, From d063b99fed776197fe0eeb95cbe25a5bc6e9ea91 Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Tue, 2 Jul 2024 19:10:25 +0530 Subject: [PATCH 03/14] Tests for forest --- .../distance_based/_proximity_forest.py | 12 +++++- .../tests/test_proximity_forest.py | 37 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 aeon/classification/distance_based/tests/test_proximity_forest.py diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py index 0df039d447..f77307e8ec 100644 --- a/aeon/classification/distance_based/_proximity_forest.py +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -3,6 +3,8 @@ The Proximity Forest is an ensemble of Proximity Trees. """ +__all__ = ["ProximityForest"] + from typing import Type, Union import numpy as np @@ -17,6 +19,14 @@ class ProximityForest(BaseClassifier): The Proximity Forest is an ensemble of Proximity Trees. """ + _tags = { + "capability:multivariate": True, + "capability:unequal_length": True, + "capability:multithreading": True, + "algorithm_type": "distance", + "X_inner_type": ["np-list", "numpy3D"], + } + def __init__( self, n_trees=10, @@ -40,7 +50,7 @@ def _fit(self, X, y): if X.shape[1] == 1: X = np.squeeze(X, axis=1) else: - raise ValueError("X should be univariate.") + raise ValueError("X should be univariate") self.classes_ = list(np.unique(y)) self.trees_ = [] diff --git a/aeon/classification/distance_based/tests/test_proximity_forest.py b/aeon/classification/distance_based/tests/test_proximity_forest.py new file mode 100644 index 0000000000..f3d0e8696b --- /dev/null +++ b/aeon/classification/distance_based/tests/test_proximity_forest.py @@ -0,0 +1,37 @@ +"""Test for Proximity Forest.""" + +import pytest +from sklearn.metrics import accuracy_score + +from aeon.classification.distance_based import ProximityForest +from aeon.testing.data_generation import make_example_3d_numpy + + +@pytest.fixture +def time_series_dataset(): + """Generate time series dataset for testing.""" + n_samples = 100 # Total number of samples (should be even) + n_timepoints = 24 # Length of each time series + n_channels = 1 + data, labels = make_example_3d_numpy(n_samples, n_channels, n_timepoints) + return data, labels + + +def test_univariate(time_series_dataset): + """Test that the function gives appropriate error message.""" + X, y = time_series_dataset + X_multivariate = X.reshape((100, 2, 12)) + clf = ProximityForest(n_trees=5, random_state=42, n_jobs=2) + with pytest.raises(ValueError, match="X should be univariate"): + clf.fit(X_multivariate, y) + + +def test_proximity_forest(time_series_dataset): + """Test the fit method of ProximityTree.""" + X, y = time_series_dataset + clf = ProximityForest(n_trees=5, n_splitters=3, max_depth=4) + clf.fit(X, y) + X_test, y_test = time_series_dataset + y_pred = clf.predict(X_test) + score = accuracy_score(y_test, y_pred) + assert score >= 0.9 From 81f86eec4b3f751b13d6c4d03a3aa7ad17ce3298 Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Tue, 2 Jul 2024 20:13:58 +0530 Subject: [PATCH 04/14] Docstring --- .../distance_based/_proximity_forest.py | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py index f77307e8ec..48f81097be 100644 --- a/aeon/classification/distance_based/_proximity_forest.py +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -16,7 +16,56 @@ class ProximityForest(BaseClassifier): """Proximity Forest Classifier. - The Proximity Forest is an ensemble of Proximity Trees. + The Proximity Forest is a distance-based classifier that creates an + ensemble of decision trees, where the splits are based on the + similarity between time series measured using various parameterised + distance measures. + + Parameters + ---------- + n_trees: int, default = 100 + The number of trees, by default an ensemble of 100 trees is formed. + n_splitters: int, default = 5 + The number of candidate splitters to be evaluated at each node. + max_depth: int, default = None + The maximum depth of the tree. If None, then nodes are expanded until all + leaves are pure or until all leaves contain less than min_samples_split samples. + min_samples_split: int, default = 2 + The minimum number of samples required to split an internal node. + random_state : int, RandomState instance or None, default=None + If `int`, random_state is the seed used by the random number generator; + If `RandomState` instance, random_state is the random number generator; + If `None`, the random number generator is the `RandomState` instance used + by `np.random`. + n_jobs : int, default = 1 + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. Parameter for compatibility purposes, still unimplemented. + + Notes + ----- + For the Java version, see + `ProximityForest + `_. + + References + ---------- + .. [1] Lucas, B., Shifaz, A., Pelletier, C., O’Neill, L., Zaidi, N., Goethals, B., + Petitjean, F. and Webb, G.I., 2019. Proximity forest: an effective and scalable + distance-based classifier for time series. Data Mining and Knowledge Discovery, + 33(3), pp.607-635. + + Examples + -------- + >>> from aeon.datasets import load_unit_test + >>> from aeon.classification.distance_based import ProximityForest + >>> X_train, y_train = load_unit_test(split="train") + >>> X_test, y_test = load_unit_test(split="test") + >>> classifier = ProximityForest(n_trees = 10, n_splitters = 3) + >>> classifier.fit(X_train, y_train) + ProximityForest(...) + >>> y_pred = classifier.predict(X_test) """ _tags = { @@ -29,7 +78,7 @@ class ProximityForest(BaseClassifier): def __init__( self, - n_trees=10, + n_trees=100, n_splitters: int = 5, max_depth: int = None, min_samples_split: int = 2, From 1ab94aae496ce2c105042be0dd30d7aff0139f41 Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Tue, 2 Jul 2024 22:10:31 +0530 Subject: [PATCH 05/14] Fix initialization error --- aeon/classification/distance_based/_proximity_forest.py | 2 +- .../distance_based/tests/test_proximity_forest.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py index 48f81097be..91ee9df428 100644 --- a/aeon/classification/distance_based/_proximity_forest.py +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -10,7 +10,7 @@ import numpy as np from aeon.classification.base import BaseClassifier -from aeon.classification.distance_based import ProximityTree +from aeon.classification.distance_based._proximity_tree import ProximityTree class ProximityForest(BaseClassifier): diff --git a/aeon/classification/distance_based/tests/test_proximity_forest.py b/aeon/classification/distance_based/tests/test_proximity_forest.py index f3d0e8696b..62c7c437a0 100644 --- a/aeon/classification/distance_based/tests/test_proximity_forest.py +++ b/aeon/classification/distance_based/tests/test_proximity_forest.py @@ -21,7 +21,7 @@ def test_univariate(time_series_dataset): """Test that the function gives appropriate error message.""" X, y = time_series_dataset X_multivariate = X.reshape((100, 2, 12)) - clf = ProximityForest(n_trees=5, random_state=42, n_jobs=2) + clf = ProximityForest(n_trees=5, random_state=42, n_jobs=-1) with pytest.raises(ValueError, match="X should be univariate"): clf.fit(X_multivariate, y) From 80f1ca8e3299cec53e0422b4ffd2c073ce4e4ba7 Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Tue, 2 Jul 2024 22:34:53 +0530 Subject: [PATCH 06/14] Update tags --- aeon/classification/distance_based/_proximity_forest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py index 91ee9df428..66899179ef 100644 --- a/aeon/classification/distance_based/_proximity_forest.py +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -69,11 +69,11 @@ class ProximityForest(BaseClassifier): """ _tags = { - "capability:multivariate": True, - "capability:unequal_length": True, + "capability:multivariate": False, + "capability:unequal_length": False, "capability:multithreading": True, "algorithm_type": "distance", - "X_inner_type": ["np-list", "numpy3D"], + "X_inner_type": ["numpy2D", "numpy3D"], } def __init__( From db136c603d318023ba50a239b4e5fb00efd77a6a Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Wed, 3 Jul 2024 09:30:47 +0530 Subject: [PATCH 07/14] Fix tests --- aeon/classification/distance_based/_proximity_forest.py | 2 +- .../distance_based/tests/test_proximity_forest.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py index 66899179ef..ec58d2cb3f 100644 --- a/aeon/classification/distance_based/_proximity_forest.py +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -99,7 +99,7 @@ def _fit(self, X, y): if X.shape[1] == 1: X = np.squeeze(X, axis=1) else: - raise ValueError("X should be univariate") + raise ValueError("X should be univariate.") self.classes_ = list(np.unique(y)) self.trees_ = [] diff --git a/aeon/classification/distance_based/tests/test_proximity_forest.py b/aeon/classification/distance_based/tests/test_proximity_forest.py index 62c7c437a0..435c70e91e 100644 --- a/aeon/classification/distance_based/tests/test_proximity_forest.py +++ b/aeon/classification/distance_based/tests/test_proximity_forest.py @@ -22,7 +22,7 @@ def test_univariate(time_series_dataset): X, y = time_series_dataset X_multivariate = X.reshape((100, 2, 12)) clf = ProximityForest(n_trees=5, random_state=42, n_jobs=-1) - with pytest.raises(ValueError, match="X should be univariate"): + with pytest.raises(ValueError): clf.fit(X_multivariate, y) From 4631e8d31850379482f56901d6a5bbf4d3004cb0 Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Fri, 5 Jul 2024 16:01:50 +0530 Subject: [PATCH 08/14] Review comments resolved --- .../distance_based/_proximity_forest.py | 18 +----------------- docs/api_reference/classification.rst | 1 + 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py index ec58d2cb3f..96b52637a5 100644 --- a/aeon/classification/distance_based/_proximity_forest.py +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -73,7 +73,7 @@ class ProximityForest(BaseClassifier): "capability:unequal_length": False, "capability:multithreading": True, "algorithm_type": "distance", - "X_inner_type": ["numpy2D", "numpy3D"], + "X_inner_type": ["numpy2D"], } def __init__( @@ -94,13 +94,6 @@ def __init__( super().__init__() def _fit(self, X, y): - # Check dimension of X - if X.ndim == 3: - if X.shape[1] == 1: - X = np.squeeze(X, axis=1) - else: - raise ValueError("X should be univariate.") - self.classes_ = list(np.unique(y)) self.trees_ = [] for _ in range(self.n_trees): @@ -114,16 +107,7 @@ def _fit(self, X, y): clf.fit(X, y) self.trees_.append(clf) - self._is_fitted = True - def _predict_proba(self, X): - # Check dimension of X - if X.ndim == 3: - if X.shape[1] == 1: - X = np.squeeze(X, axis=1) - else: - raise ValueError("X should be univariate.") - output_probas = [] for i in range(self.n_trees): proba = self.trees_[i].predict_proba(X) diff --git a/docs/api_reference/classification.rst b/docs/api_reference/classification.rst index ab36e9a19b..107e00d423 100644 --- a/docs/api_reference/classification.rst +++ b/docs/api_reference/classification.rst @@ -73,6 +73,7 @@ Distance-based ElasticEnsemble KNeighborsTimeSeriesClassifier + ProximityForest ProximityTree Feature-based From b7d046183215b0a5e9c1bda47a858043c755b13d Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Fri, 5 Jul 2024 16:06:27 +0530 Subject: [PATCH 09/14] Review comments resolved --- .../distance_based/_proximity_tree.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/aeon/classification/distance_based/_proximity_tree.py b/aeon/classification/distance_based/_proximity_tree.py index 3a32e4d077..1276cdf341 100644 --- a/aeon/classification/distance_based/_proximity_tree.py +++ b/aeon/classification/distance_based/_proximity_tree.py @@ -117,7 +117,7 @@ class ProximityTree(BaseClassifier): "capability:multivariate": False, "capability:unequal_length": False, "algorithm_type": "distance", - "X_inner_type": ["numpy2D", "numpy3D"], + "X_inner_type": ["numpy2D"], } def __init__( @@ -371,12 +371,6 @@ def _find_target_value(y): return mode_value def _fit(self, X, y): - # Check dimension of X - if X.ndim == 3: - if X.shape[1] == 1: - X = np.squeeze(X, axis=1) - else: - raise ValueError("X should be univariate.") # Set the unique class labels self.classes_ = list(np.unique(y)) @@ -391,12 +385,6 @@ def _predict(self, X): return np.array([self.classes_[pred] for pred in predictions]) def _predict_proba(self, X): - # Check dimension of X - if X.ndim == 3: - if X.shape[1] == 1: - X = np.squeeze(X, axis=1) - else: - raise ValueError("X should be univariate.") # Get the unique class labels classes = self.classes_ class_count = len(classes) From 8905461e5da99ccc1e45d5f7951b9e6ee699e143 Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Fri, 5 Jul 2024 19:14:52 +0530 Subject: [PATCH 10/14] Parallelization using joblib --- .../distance_based/_proximity_forest.py | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py index 96b52637a5..9c24aa4f89 100644 --- a/aeon/classification/distance_based/_proximity_forest.py +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -8,6 +8,7 @@ from typing import Type, Union import numpy as np +from joblib import Parallel, delayed from aeon.classification.base import BaseClassifier from aeon.classification.distance_based._proximity_tree import ProximityTree @@ -95,28 +96,32 @@ def __init__( def _fit(self, X, y): self.classes_ = list(np.unique(y)) - self.trees_ = [] - for _ in range(self.n_trees): - clf = ProximityTree( - n_splitters=self.n_splitters, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - random_state=self.random_state, - n_jobs=self.n_jobs, - ) - clf.fit(X, y) - self.trees_.append(clf) + self.trees_ = Parallel(n_jobs=self.n_jobs)( + delayed(self._fit_tree)(X, y) for _ in range(self.n_trees) + ) + + def _fit_tree(self, X, y): + clf = ProximityTree( + n_splitters=self.n_splitters, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + random_state=self.random_state, + n_jobs=self.n_jobs, + ) + clf.fit(X, y) + return clf def _predict_proba(self, X): - output_probas = [] - for i in range(self.n_trees): - proba = self.trees_[i].predict_proba(X) - output_probas.append(proba) - + output_probas = Parallel(n_jobs=self.n_jobs)( + delayed(self._predict_proba_tree)(tree, X) for tree in self.trees_ + ) output_probas = np.sum(output_probas, axis=0) output_probas = np.divide(output_probas, self.n_trees) return output_probas + def _predict_proba_tree(self, tree, X): + return tree.predict_proba(X) + def _predict(self, X): probas = self._predict_proba(X) idx = np.argmax(probas, axis=1) From 2d74d4d3f8ed8041ca6b17eeb0443fb0ded81f83 Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Mon, 8 Jul 2024 20:10:14 +0530 Subject: [PATCH 11/14] pickling objects --- .../distance_based/_proximity_forest.py | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py index 9c24aa4f89..0dae81aba2 100644 --- a/aeon/classification/distance_based/_proximity_forest.py +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -97,33 +97,44 @@ def __init__( def _fit(self, X, y): self.classes_ = list(np.unique(y)) self.trees_ = Parallel(n_jobs=self.n_jobs)( - delayed(self._fit_tree)(X, y) for _ in range(self.n_trees) + delayed(_fit_tree)( + X, + y, + self.n_splitters, + self.max_depth, + self.min_samples_split, + self.random_state, + self.n_jobs, + ) + for _ in range(self.n_trees) ) - def _fit_tree(self, X, y): - clf = ProximityTree( - n_splitters=self.n_splitters, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - random_state=self.random_state, - n_jobs=self.n_jobs, - ) - clf.fit(X, y) - return clf - def _predict_proba(self, X): output_probas = Parallel(n_jobs=self.n_jobs)( - delayed(self._predict_proba_tree)(tree, X) for tree in self.trees_ + delayed(_predict_proba_tree)(tree, X) for tree in self.trees_ ) output_probas = np.sum(output_probas, axis=0) output_probas = np.divide(output_probas, self.n_trees) return output_probas - def _predict_proba_tree(self, tree, X): - return tree.predict_proba(X) - def _predict(self, X): probas = self._predict_proba(X) idx = np.argmax(probas, axis=1) preds = np.asarray([self.classes_[x] for x in idx]) return preds + + +def _fit_tree(X, y, n_splitters, max_depth, min_samples_split, random_state, n_jobs): + clf = ProximityTree( + n_splitters=n_splitters, + max_depth=max_depth, + min_samples_split=min_samples_split, + random_state=random_state, + n_jobs=n_jobs, + ) + clf.fit(X, y) + return clf + + +def _predict_proba_tree(tree, X): + return tree.predict_proba(X) From b7505adf1283df1ef156a0889a0d226a12f2ad34 Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Thu, 11 Jul 2024 21:15:57 +0530 Subject: [PATCH 12/14] Parallel threading --- aeon/classification/distance_based/_proximity_forest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py index 0dae81aba2..f6965f8d20 100644 --- a/aeon/classification/distance_based/_proximity_forest.py +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -96,7 +96,7 @@ def __init__( def _fit(self, X, y): self.classes_ = list(np.unique(y)) - self.trees_ = Parallel(n_jobs=self.n_jobs)( + self.trees_ = Parallel(n_jobs=self._n_jobs, prefer="threads")( delayed(_fit_tree)( X, y, @@ -110,7 +110,7 @@ def _fit(self, X, y): ) def _predict_proba(self, X): - output_probas = Parallel(n_jobs=self.n_jobs)( + output_probas = Parallel(n_jobs=self._n_jobs, prefer="threads")( delayed(_predict_proba_tree)(tree, X) for tree in self.trees_ ) output_probas = np.sum(output_probas, axis=0) From c853e55bea9414aa343eda08686a179616a571b5 Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Thu, 11 Jul 2024 22:51:57 +0530 Subject: [PATCH 13/14] Using unit test dataset --- .../tests/test_proximity_forest.py | 26 ++++++------------- .../tests/test_proximity_tree.py | 26 ++++++------------- 2 files changed, 16 insertions(+), 36 deletions(-) diff --git a/aeon/classification/distance_based/tests/test_proximity_forest.py b/aeon/classification/distance_based/tests/test_proximity_forest.py index 435c70e91e..9575177327 100644 --- a/aeon/classification/distance_based/tests/test_proximity_forest.py +++ b/aeon/classification/distance_based/tests/test_proximity_forest.py @@ -4,34 +4,24 @@ from sklearn.metrics import accuracy_score from aeon.classification.distance_based import ProximityForest -from aeon.testing.data_generation import make_example_3d_numpy +from aeon.datasets import load_unit_test -@pytest.fixture -def time_series_dataset(): - """Generate time series dataset for testing.""" - n_samples = 100 # Total number of samples (should be even) - n_timepoints = 24 # Length of each time series - n_channels = 1 - data, labels = make_example_3d_numpy(n_samples, n_channels, n_timepoints) - return data, labels - - -def test_univariate(time_series_dataset): +def test_univariate(): """Test that the function gives appropriate error message.""" - X, y = time_series_dataset - X_multivariate = X.reshape((100, 2, 12)) + X, y = load_unit_test() + X_multivariate = X.reshape((-1, 2, 12)) clf = ProximityForest(n_trees=5, random_state=42, n_jobs=-1) with pytest.raises(ValueError): clf.fit(X_multivariate, y) -def test_proximity_forest(time_series_dataset): +def test_proximity_forest(): """Test the fit method of ProximityTree.""" - X, y = time_series_dataset + X_train, y_train = load_unit_test() + X_test, y_test = load_unit_test(split="test") clf = ProximityForest(n_trees=5, n_splitters=3, max_depth=4) - clf.fit(X, y) - X_test, y_test = time_series_dataset + clf.fit(X_train, y_train) y_pred = clf.predict(X_test) score = accuracy_score(y_test, y_pred) assert score >= 0.9 diff --git a/aeon/classification/distance_based/tests/test_proximity_tree.py b/aeon/classification/distance_based/tests/test_proximity_tree.py index 42510f0cd6..4b86f82840 100644 --- a/aeon/classification/distance_based/tests/test_proximity_tree.py +++ b/aeon/classification/distance_based/tests/test_proximity_tree.py @@ -6,17 +6,7 @@ from aeon.classification.distance_based import ProximityTree from aeon.classification.distance_based._proximity_tree import gini, gini_gain -from aeon.testing.data_generation import make_example_3d_numpy - - -@pytest.fixture -def time_series_dataset(): - """Generate time series dataset for testing.""" - n_samples = 100 # Total number of samples (should be even) - n_timepoints = 24 # Length of each time series - n_channels = 1 - data, labels = make_example_3d_numpy(n_samples, n_channels, n_timepoints) - return data, labels +from aeon.datasets import load_unit_test def test_gini(): @@ -110,9 +100,9 @@ def test_get_parameter_value(): assert measure_params["c"] in [10**i for i in range(-2, 3)] -def test_get_cadidate_splitter(time_series_dataset): +def test_get_cadidate_splitter(): """Test the method to generate candidate splitters.""" - X, y = time_series_dataset + X, y = load_unit_test() clf = ProximityTree() splitter = clf._get_candidate_splitter(X, y) assert len(splitter) == 2 @@ -132,9 +122,9 @@ def test_get_cadidate_splitter(time_series_dataset): assert measure in expected_measures -def test_get_best_splitter(time_series_dataset): +def test_get_best_splitter(): """Test the method to get optimum splitter of a node.""" - X, y = time_series_dataset + X, y = load_unit_test() clf = ProximityTree(n_splitters=3) splitter = clf._get_best_splitter(X, y) @@ -146,12 +136,12 @@ def test_get_best_splitter(time_series_dataset): assert len(splitter) == 2 -def test_proximity_tree(time_series_dataset): +def test_proximity_tree(): """Test the fit method of ProximityTree.""" - X, y = time_series_dataset + X, y = load_unit_test() clf = ProximityTree(n_splitters=3, max_depth=4) clf.fit(X, y) - X_test, y_test = time_series_dataset + X_test, y_test = load_unit_test(split="train") y_pred = clf.predict(X_test) score = accuracy_score(y_test, y_pred) assert score >= 0.9 From e5a095f44ce6dfa83021e09f574bb8a210708f5a Mon Sep 17 00:00:00 2001 From: Divya Tiwari Date: Mon, 15 Jul 2024 09:38:38 +0530 Subject: [PATCH 14/14] parallel_backend parameter --- .../distance_based/_proximity_forest.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/aeon/classification/distance_based/_proximity_forest.py b/aeon/classification/distance_based/_proximity_forest.py index f6965f8d20..c88d2b7823 100644 --- a/aeon/classification/distance_based/_proximity_forest.py +++ b/aeon/classification/distance_based/_proximity_forest.py @@ -43,6 +43,11 @@ class ProximityForest(BaseClassifier): ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. Parameter for compatibility purposes, still unimplemented. + parallel_backend : str, ParallelBackendBase instance or None, default=None + Specify the parallelisation backend implementation in joblib, if None a 'prefer' + value of "threads" is used by default. + Valid options are "loky", "multiprocessing", "threading" or a custom backend. + See the joblib Parallel documentation for more details. Notes ----- @@ -85,6 +90,7 @@ def __init__( min_samples_split: int = 2, random_state: Union[int, Type[np.random.RandomState], None] = None, n_jobs: int = 1, + parallel_backend=None, ): self.n_trees = n_trees self.n_splitters = n_splitters @@ -92,11 +98,14 @@ def __init__( self.min_samples_split = min_samples_split self.random_state = random_state self.n_jobs = n_jobs + self.parallel_backend = parallel_backend super().__init__() def _fit(self, X, y): self.classes_ = list(np.unique(y)) - self.trees_ = Parallel(n_jobs=self._n_jobs, prefer="threads")( + self.trees_ = Parallel( + n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" + )( delayed(_fit_tree)( X, y, @@ -110,9 +119,9 @@ def _fit(self, X, y): ) def _predict_proba(self, X): - output_probas = Parallel(n_jobs=self._n_jobs, prefer="threads")( - delayed(_predict_proba_tree)(tree, X) for tree in self.trees_ - ) + output_probas = Parallel( + n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" + )(delayed(_predict_proba_tree)(tree, X) for tree in self.trees_) output_probas = np.sum(output_probas, axis=0) output_probas = np.divide(output_probas, self.n_trees) return output_probas