In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset archive is read from
# a path under this mount point.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Get categories and extract files


In [None]:
import tarfile

# Location of the compressed dataset on the mounted Drive and the local
# directory it is unpacked into.
tar_file = "/content/drive/MyDrive/speech_commands_v0.02.tar.gz"
extract_path = "/content/speech_commands_v0.02"

with tarfile.open(tar_file, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters reject members that would escape extract_path
        # (absolute paths, "..", unsafe links). Calling extractall() without
        # a filter is deprecated on Python 3.12+.
        tar.extractall(extract_path, filter="data")
    else:
        # Older interpreters without extraction-filter support.
        tar.extractall(extract_path)

print("file extracted ")


file extracted 


In [None]:
import torch

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
import os
from sklearn.model_selection import train_test_split

DATASET_PATH = "/content/speech_commands_v0.02"
# The 10 target command words; a word's position in this list is its class id.
CATEGORIES = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"]

X_paths = []   # path of every .wav clip that belongs to a target category
Y_labels = []  # class id (index into CATEGORIES) of the clip at the same position

for label_id, label in enumerate(CATEGORIES):
    label_path = os.path.join(DATASET_PATH, label)
    if not os.path.isdir(label_path):
        # Categories without a folder are skipped silently.
        continue
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting fixes the order of X_paths so that the seeded train/test split
    # below selects the same files on every run.
    for file_name in sorted(os.listdir(label_path)):
        if file_name.endswith(".wav"):
            X_paths.append(os.path.join(label_path, file_name))
            Y_labels.append(label_id)

print(f"Total found {len(X_paths)} audio files")

# Hold out 20% of the clips for testing. The split is seeded and stratified
# on the labels, so both partitions keep the same class proportions.
# NOTE(review): a purely random split may put clips from the same speaker in
# both partitions; the dataset is understood to ship official
# validation/testing file lists - confirm whether those should be used instead.
split_parts = train_test_split(
    X_paths,
    Y_labels,
    test_size=0.2,
    random_state=2,
    stratify=Y_labels,
)
X_train_paths, X_test_paths, Y_train, Y_test = split_parts

print(f"Training set: {len(X_train_paths)} samples")
print(f"Testing set: {len(X_test_paths)} samples")

Total found 38546 audio files
Training set: 30836 samples
Testing set: 7710 samples


# Get Wav2Vec2.0 features

In [None]:
import librosa
import numpy as np
import torch #deeplearning
from transformers import Wav2Vec2Processor, Wav2Vec2Model #huggingface

# Wav2Vec2.0 model: the processor and the encoder are loaded from the same
# pretrained checkpoint.
WAV2VEC_CHECKPOINT = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(WAV2VEC_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(WAV2VEC_CHECKPOINT)
model = model.to(device)  # Move to GPU when one was detected

# Extract Wav2Vec2.0 features from a single audio file
def extract_wav2vec_features(audio_path):
    """Return the mean-pooled Wav2Vec2.0 embedding of one audio file.

    The file is loaded with librosa at a 16 kHz sampling rate, converted to
    model input by the module-level ``processor``, and run through the
    module-level ``model`` on ``device`` without tracking gradients. The last
    hidden state is then averaged over dim 1.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        NumPy array with the pooled hidden state. The batch dimension is not
        squeezed, so the shape is presumably (1, hidden_size) - TODO confirm.
    """
    waveform, _ = librosa.load(audio_path, sr=16000)
    encoded = processor(waveform, return_tensors="pt", sampling_rate=16000)
    input_values = encoded.input_values.to(device)

    with torch.no_grad():  # inference only, no autograd bookkeeping
        hidden_states = model(input_values).last_hidden_state

    pooled = hidden_states.mean(dim=1)
    return pooled.cpu().numpy()

def _conv_output_length(num_samples, kernels, strides):
    """Return how many frames a stack of un-padded 1-D convolutions emits for ``num_samples`` input samples."""
    for kernel, stride in zip(kernels, strides):
        num_samples = (num_samples - kernel) // stride + 1
    return num_samples


# Batch processing (speed up extraction)
def batch_extract_wav2vec_features(audio_paths, batch_size=8, log_every=5000):
    """Extract one mean-pooled Wav2Vec2.0 embedding per audio file, in batches.

    Each batch is loaded with librosa at 16 kHz, padded to a common length by
    the module-level ``processor`` and run through the module-level ``model``
    on ``device`` without tracking gradients.

    The mean over time uses only the frames that come from each clip's own
    samples. Averaging over the padded frames as well would make a clip's
    embedding depend on the longest clip that shares its batch.

    Args:
        audio_paths: Sequence of audio file paths.
        batch_size: Number of files sent through the model at once.
        log_every: Print a progress line each time this many more samples
            have been processed; a value <= 0 prints only the final line.

    Returns:
        NumPy array with one row per input path, in input order. An empty
        ``audio_paths`` gives an empty array of shape (0,).
    """
    total = len(audio_paths)
    features = []
    next_log = log_every

    for start in range(0, total, batch_size):
        batch_paths = audio_paths[start:start + batch_size]
        batch_audio = [librosa.load(path, sr=16000)[0] for path in batch_paths]  # Load multiple audio files

        # Convert batch to model input format and move to GPU
        input_values = processor(
            batch_audio, return_tensors="pt", padding=True, sampling_rate=16000
        ).input_values.to(device)

        with torch.no_grad():
            hidden_states = model(input_values).last_hidden_state  # indexed as (batch, frames, hidden)

        # Number of encoder frames backed by real audio for every clip, derived
        # from the convolutional feature-encoder geometry in the model config.
        # Assumes the processor pads at the end of each clip (understood to be
        # the Wav2Vec2 feature-extractor default - confirm if it is customised).
        max_frames = hidden_states.shape[1]
        frame_counts = torch.tensor(
            [
                _conv_output_length(len(clip), model.config.conv_kernel, model.config.conv_stride)
                for clip in batch_audio
            ],
            device=hidden_states.device,
        ).clamp(min=1, max=max_frames)

        # Masked mean pooling: zero the padded frames, then divide by the
        # number of valid frames rather than by the padded length.
        frame_index = torch.arange(max_frames, device=hidden_states.device)
        valid = (frame_index[None, :] < frame_counts[:, None]).unsqueeze(-1).to(hidden_states.dtype)
        pooled = (hidden_states * valid).sum(dim=1) / frame_counts[:, None].to(hidden_states.dtype)
        features.extend(pooled.cpu().numpy())

        # Threshold-based progress log. A check such as `start % log_every == 0`
        # only fires when log_every is a multiple of batch_size.
        processed = start + len(batch_paths)
        if processed == total or (log_every > 0 and processed >= next_log):
            print(f"Processed {processed}/{total} samples...")
            if log_every > 0:
                next_log = (processed // log_every + 1) * log_every

    return np.array(features)

# Run the batched extractor over both partitions, training paths first.
EXTRACTION_BATCH_SIZE = 8

print("Extracting Wav2Vec2.0 features")
X_train_wav2vec = batch_extract_wav2vec_features(
    X_train_paths, batch_size=EXTRACTION_BATCH_SIZE
)
train_shape = X_train_wav2vec.shape
print(f"Extraction complete! Training dataset shape: {train_shape}")

# Visual separator between the two progress logs.
print("------------------------------------------------------------------")
X_test_wav2vec = batch_extract_wav2vec_features(
    X_test_paths, batch_size=EXTRACTION_BATCH_SIZE
)
test_shape = X_test_wav2vec.shape
print(f"Extraction complete! Test dataset shape: {test_shape}")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracting Wav2Vec2.0 features
Processed 8/30836 samples...
Processed 5008/30836 samples...
Processed 10008/30836 samples...
Processed 15008/30836 samples...
Processed 20008/30836 samples...
Processed 25008/30836 samples...
Processed 30008/30836 samples...
Processed 30836/30836 samples...
Extraction complete! Training dataset shape: (30836, 768)
------------------------------------------------------------------
Processed 8/7710 samples...
Processed 5008/7710 samples...
Processed 7710/7710 samples...
Extraction complete! Test dataset shape: (7710, 768)


# Training MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Train MLP classifier on the pooled Wav2Vec2.0 embeddings
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    alpha=0.005,      # L2 regularization strength
    batch_size=512,
    max_iter=200,
    random_state=2,   # seeds weight init and batch shuffling so reruns give the same model; same seed as the train/test split
    verbose=True,     # print the training loss after every iteration
)
clf.fit(X_train_wav2vec, Y_train)

# Evaluation on the held-out test embeddings
Y_pred = clf.predict(X_test_wav2vec)
accuracy = accuracy_score(Y_test, Y_pred)

print(f"Wav2Vec2.0 + MLP Test Accuracy: {accuracy:.4f}")


Iteration 1, loss = 1.31000024
Iteration 2, loss = 0.40335484
Iteration 3, loss = 0.31281104
Iteration 4, loss = 0.27912516
Iteration 5, loss = 0.26058057
Iteration 6, loss = 0.24713123
Iteration 7, loss = 0.23480921
Iteration 8, loss = 0.22682017
Iteration 9, loss = 0.21983283
Iteration 10, loss = 0.21255816
Iteration 11, loss = 0.20929381
Iteration 12, loss = 0.20154567
Iteration 13, loss = 0.19864978
Iteration 14, loss = 0.19388141
Iteration 15, loss = 0.19203957
Iteration 16, loss = 0.18872586
Iteration 17, loss = 0.18418554
Iteration 18, loss = 0.17986955
Iteration 19, loss = 0.17954967
Iteration 20, loss = 0.17653211
Iteration 21, loss = 0.17561824
Iteration 22, loss = 0.17320583
Iteration 23, loss = 0.17638767
Iteration 24, loss = 0.16687313
Iteration 25, loss = 0.16709057
Iteration 26, loss = 0.16389270
Iteration 27, loss = 0.16135906
Iteration 28, loss = 0.16228216
Iteration 29, loss = 0.16080513
Iteration 30, loss = 0.15917885
Iteration 31, loss = 0.16100347
Iteration 32, los



Wav2Vec2.0 + MLP Test Accuracy: 0.9419


In [None]:
# Score the fitted classifier on both partitions so the training and test
# accuracies can be compared side by side.
train_acc, test_acc = (
    clf.score(split_features, split_labels)
    for split_features, split_labels in (
        (X_train_wav2vec, Y_train),
        (X_test_wav2vec, Y_test),
    )
)

print(f"✅ Training Accuracy: {train_acc:.4f}")
print(f"✅ Test Accuracy: {test_acc:.4f}")

✅ Training Accuracy: 0.9905
✅ Test Accuracy: 0.9419
