In [7]:
import os
import numpy as np
import scipy.io.wavfile as wav
from scipy.fft import fft
from scipy.spatial.distance import euclidean
from python_speech_features import mfcc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [13]:
# Step 1: Divide the signal into frames
def frame_signal(signal, frame_length, frame_step):
    num_samples = len(signal)
    num_frames = 1 + int(np.ceil((num_samples - frame_length) / frame_step))
    frames = np.zeros((num_frames, frame_length))

    for i in range(num_frames):
        start = i * frame_step
        end = min(start + frame_length, num_samples)
        frames[i, :end - start] = signal[start:end]

    return frames

In [14]:
# Step 2: Compute spectrum using FFT for each frame
def compute_spectrum(frames):
    return np.abs(fft(frames))

In [15]:
# Step 3: Compute average frames for each speech class in the training set
def compute_average_frames(train_data, train_labels, classes):
    class_avg_frames = {}

    for class_name in classes:
        class_indices = np.where(train_labels == class_name)[0]
        class_frames = train_data[class_indices]
        avg_frame = np.mean(class_frames, axis=0)
        class_avg_frames[class_name] = avg_frame

    return class_avg_frames

In [16]:

# Step 4: Compute features for the test wave file and compute distances for each speech class
def compute_distances(test_features, class_avg_frames):
    distances = {}

    for class_name, avg_frame in class_avg_frames.items():

        flat_test_features = test_features.flatten()
        flat_avg_frame = avg_frame.flatten()

        distance = euclidean(flat_test_features, flat_avg_frame)
        distances[class_name] = distance
        print(f"Distance to class '{class_name}': {distance}")

    return distances

In [17]:
# Step 5: Pick the class with the smallest distance
def predict_class(distances):
    return min(distances, key=distances.get)

# Load the dataset
data_dir = r'D:\\jupyter.ipynb\\mini_speech\\mini_speech_commands'
classes = os.listdir(data_dir)

train_data = []
train_labels = []
max_length = 100  # Define the maximum length for features

# Extract features for the training set
for class_name in classes:
    class_dir = os.path.join(data_dir, class_name)
    filenames = os.listdir(class_dir)

    for filename in filenames:
        if filename.endswith('.wav'):
            filepath = os.path.join(class_dir, filename)
            _, signal = wav.read(filepath)
            features = mfcc(signal)
            
            if len(features) < max_length:
                features = np.pad(features, ((0, max_length - len(features)), (0, 0)), mode='constant')
            else:
                features = features[:max_length, :]
            train_data.append(features)
            train_labels.append(class_name)

# Convert lists to numpy arrays
train_data = np.array(train_data)
train_labels = np.array(train_labels)

# Reshape the features to have a consistent shape
num_samples, num_frames, num_features = train_data.shape[0], train_data.shape[1], train_data.shape[2]
train_data_reshaped = train_data.reshape(num_samples, num_frames * num_features)

# Preprocess data
scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(train_data_reshaped)


test_size = 0.2
random_state = 42
train_data_split, test_data_split, train_labels_split, test_labels_split = train_test_split(
    train_data_scaled, train_labels, test_size=test_size, random_state=random_state
)
class_avg_frames = compute_average_frames(train_data_split, train_labels_split, classes)

# Train a simple KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_data_split, train_labels_split)

# Evaluate the model on the testing set
test_predictions = knn.predict(test_data_split)
accuracy = accuracy_score(test_labels_split, test_predictions)
print(f"Accuracy on the testing set: {accuracy * 100:.2f}%")

# Example for testing a sample file
test_file = r'D:\\jupyter.ipynb\\mini_speech\\mini_speech_commands\\up\\0c5027de_nohash_0.wav'
test_rate, test_signal = wav.read(test_file)
test_features = mfcc(test_signal)
# Pad or truncate the test features to the defined maximum length
if len(test_features) < max_length:
    test_features = np.pad(test_features, ((0, max_length - len(test_features)), (0, 0)), mode='constant')
else:
    test_features = test_features[:max_length, :]

# Reshape and preprocess the test data
test_features_reshaped = test_features.reshape(1, -1)
test_features_scaled = scaler.transform(test_features_reshaped)

# Predict the class and print the Euclidean distance for each class
predicted_class = knn.predict(test_features_scaled)
print(f"The predicted class for '{test_file}' is: {predicted_class[0]}")

# Compute distances to each class
distances_to_classes = compute_distances(test_features_reshaped, class_avg_frames)

# Print distances for each class
for class_name, distance in distances_to_classes.items():
    print(f"Euclidean distance to class '{class_name}': {distance}")

Accuracy on the testing set: 63.00%
The predicted class for 'D:\\jupyter.ipynb\\mini_speech\\mini_speech_commands\\up\\0c5027de_nohash_0.wav' is: up
Distance to class 'down': 488.2716048918962
Distance to class 'go': 487.78051999504936
Distance to class 'left': 488.76282393436065
Distance to class 'no': 488.45731478534674
Distance to class 'right': 488.72223496138474
Distance to class 'stop': 485.50903171604784
Distance to class 'up': 486.69094788438383
Distance to class 'yes': 490.2543374655764
Euclidean distance to class 'down': 488.2716048918962
Euclidean distance to class 'go': 487.78051999504936
Euclidean distance to class 'left': 488.76282393436065
Euclidean distance to class 'no': 488.45731478534674
Euclidean distance to class 'right': 488.72223496138474
Euclidean distance to class 'stop': 485.50903171604784
Euclidean distance to class 'up': 486.69094788438383
Euclidean distance to class 'yes': 490.2543374655764
