In [1]:
import numpy as np
import random

# Set a global random seed
SEED = 42
np.random.seed(SEED)
random.seed(SEED)


In [2]:
"""Hydra classifier.

Pipeline classifier using the Hydra transformer and RidgeClassifierCV estimator.
"""

__maintainer__ = ["MatthewMiddlehurst"]
__all__ = ["HydraClassifier"]

import numpy as np
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline

from aeon.classification import BaseClassifier
from aeon.transformations.collection.convolution_based._hydra import HydraTransformer


class HydraClassifier(BaseClassifier):
    """Hydra Classifier.

    The algorithm utilises convolutional kernels grouped into ``g`` groups per dilation
    with ``k`` kernels per group. It transforms input time series using these kernels
    and counts the kernels representing the closest match to the input at each time
    point. This counts for each group are then concatenated and used to train a linear
    classifier.

    The algorithm combines aspects of both Rocket (convolutional approach)
    and traditional dictionary methods (pattern counting), It extracts features from
    both the base series and first-order differences of the series.

    Parameters
    ----------
    n_kernels : int, default=8
        Number of kernels per group.
    n_groups : int, default=64
        Number of groups per dilation.
    class_weight{“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
        From sklearn documentation:
        If not given, all classes are supposed to have weight one.
        The “balanced” mode uses the values of y to automatically adjust weights
        inversely proportional to class frequencies in the input data as
        n_samples / (n_classes * np.bincount(y))
        The “balanced_subsample” mode is the same as “balanced” except that weights
        are computed based on the bootstrap sample for every tree grown.
        For multi-output, the weights of each column of y will be multiplied.
        Note that these weights will be multiplied with sample_weight (passed through
        the fit method) if sample_weight is specified.
    n_jobs : int, default=1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``-1`` means using all processors.
    random_state : int, RandomState instance or None, default=None
        If `int`, random_state is the seed used by the random number generator;
        If `RandomState` instance, random_state is the random number generator;
        If `None`, the random number generator is the `RandomState` instance used
        by `np.random`.

    Attributes
    ----------
    n_classes_ : int
        Number of classes. Extracted from the data.
    classes_ : ndarray of shape (n_classes_)
        Holds the label for each class.

    See Also
    --------
    HydraTransformer
    MultiRocketHydraClassifier

    Notes
    -----
    Original code: https://github.com/angus924/hydra

    References
    ----------
    .. [1] Dempster, A., Schmidt, D.F. and Webb, G.I., 2023. Hydra: Competing
        convolutional kernels for fast and accurate time series classification.
        Data Mining and Knowledge Discovery, pp.1-27.

    Examples
    --------
    >>> from aeon.classification.convolution_based import HydraClassifier
    >>> from aeon.testing.data_generation import make_example_3d_numpy
    >>> X, y = make_example_3d_numpy(n_cases=10, n_channels=1, n_timepoints=12,
    ...                              random_state=0)
    >>> clf = HydraClassifier(random_state=0)  # doctest: +SKIP
    >>> clf.fit(X, y)  # doctest: +SKIP
    HydraClassifier(random_state=0)
    >>> clf.predict(X)  # doctest: +SKIP
    array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0])
    """

    _tags = {
        "capability:multivariate": True,
        "capability:multithreading": True,
        "algorithm_type": "convolution",
        "python_dependencies": "torch",
    }

    def __init__(
        self,
        n_kernels: int = 8,
        n_groups: int = 64,
        class_weight=None,
        n_jobs: int = 1,
        random_state=None,
    ):
        self.n_kernels = n_kernels
        self.n_groups = n_groups
        self.class_weight = class_weight
        self.n_jobs = n_jobs
        self.random_state = random_state

        super().__init__()

    def _fit(self, X, y):
        transform = HydraTransformer(
            n_kernels=self.n_kernels,
            n_groups=self.n_groups,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
        )

        self._clf = make_pipeline(
            transform,
            _SparseScaler(),
            RidgeClassifierCV(
                alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
            ),
        )
        self._clf.fit(X, y)

        return self

    def _predict(self, X) -> np.ndarray:
        return self._clf.predict(X)


class _SparseScaler:
    """Sparse Scaler for hydra transform."""

    def __init__(self, mask=True, exponent=4):
        self.mask = mask
        self.exponent = exponent

    def fit(self, X, y=None):
        X = X.clamp(0).sqrt()

        self.epsilon = (X == 0).float().mean(0) ** self.exponent + 1e-8

        self.mu = X.mean(0)
        self.sigma = X.std(0) + self.epsilon

    def transform(self, X, y=None):
        X = X.clamp(0).sqrt()

        if self.mask:
            return ((X - self.mu) * (X != 0)) / self.sigma
        else:
            return (X - self.mu) / self.sigma

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

In [3]:
import os
import numpy as np
import librosa

def load_data_from_directory(directory, sample_length=16000, n_channels=1):
    X = []
    y = []
    labels = sorted(os.listdir(directory))
    label_map = {label: idx for idx, label in enumerate(labels)}  # Create a label to index mapping

    total_files = 0  # Initialize a counter for total files

    for label in labels:
        class_dir = os.path.join(directory, label)
        if os.path.isdir(class_dir):
            file_count = 0  # Counter for files in the current class directory
            for file_name in os.listdir(class_dir):
                if file_name.endswith('.wav'):
                    file_count += 1  # Increment file count for the current class
                    file_path = os.path.join(class_dir, file_name)
                    # Load audio
                    signal, sr = librosa.load(file_path, sr=16000)
                    # Ensure length is 1 second (16000 samples)
                    if len(signal) != sample_length:
                        # Pad or truncate to sample_length
                        if len(signal) < sample_length:
                            signal = np.pad(signal, (0, sample_length - len(signal)))
                        else:
                            signal = signal[:sample_length]
                    # Reshape signal to match (n_channels, n_timepoints)
                    X.append(signal.reshape(n_channels, -1))  # Reshape to (1, sample_length)
                    y.append(label_map[label])  # Use the index of the label

            total_files += file_count  # Add the current class file count to the total
            print(f"Total number of files in folder '{label}': {file_count}")

    # Convert lists to numpy arrays
    X = np.array(X)
    y = np.array(y)

    print(X.shape)
    print(y.shape)
    print(y)
    print(f"Overall total number of files in the dataset: {total_files}")

    return X, y, labels

# Load and preprocess data
directory = "C:/Users/WORKSTATIONS/Desktop/BijoyashreeDas/12KWS"
X, y, labels = load_data_from_directory(directory)


Total number of files in folder 'down': 2359
Total number of files in folder 'go': 2372
Total number of files in folder 'left': 2353
Total number of files in folder 'no': 2375
Total number of files in folder 'off': 2357
Total number of files in folder 'on': 2367
Total number of files in folder 'right': 2367
Total number of files in folder 'silence': 2010
Total number of files in folder 'stop': 2380
Total number of files in folder 'unknown': 2000
Total number of files in folder 'up': 2375
Total number of files in folder 'yes': 2377
(27692, 1, 16000)
(27692,)
[ 0  0  0 ... 11 11 11]
Overall total number of files in the dataset: 27692


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:



# Split data into training and testing sets with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)






In [6]:
# Initialize the Classifier

clf = HydraClassifier(
    random_state=SEED
)







In [7]:
# Fit the classifier on the training data
clf.fit(X_train, y_train)

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 362954752000 bytes.

In [None]:
# Predict on the test set
y_pred = clf.predict(X_test)

# Print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data:", accuracy)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate precision, recall, and f1 score for each class (macro average)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(f"Precision (macro): {precision}")
print(f"Recall (macro): {recall}")
print(f"F1 Score (macro): {f1}")





In [None]:
#confusion matrix

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, confusion_matrix, ConfusionMatrixDisplay


# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()


