In [1]:
pip install tensorflow tensorflow-io librosa


Collecting tensorflow-io
  Downloading tensorflow_io-0.37.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (14 kB)
Collecting tensorflow-io-gcs-filesystem==0.37.1 (from tensorflow-io)
  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (14 kB)
Downloading tensorflow_io-0.37.1-cp312-cp312-macosx_12_0_arm64.whl (31.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.8/31.8 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tensorflow_io_gcs_filesystem-0.37.1-cp312-cp312-macosx_12_0_arm64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: tensorflow-io-gcs-filesystem, tensorflow-io
Successfully installed tensorflow-io-0.37.1 tensorflow-io-gcs-filesystem-0.37.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49

In [9]:
import os
import tensorflow as tf
import librosa
import numpy as np

# Location of the Speech Commands dataset. Set the SPEECH_COMMANDS_DIR environment
# variable to point at another copy; when it is unset the original local path is used.
data_dir = os.environ.get(
    'SPEECH_COMMANDS_DIR',
    '/Users/aanishverma/Downloads/speech_commands_v0.02/',
)

# Every sub-directory of data_dir is treated as a command name; plain files are skipped
commands = np.array([d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))])

print(f"Commands: {commands}")

# Load the WAV file and convert it into a spectrogram
def preprocess_audio(filepath, max_length=16000):
    """Turn one audio file into a fixed-size log-Mel spectrogram.

    Args:
        filepath: Path of the audio file to load (loaded at 16 kHz).
        max_length: Number of samples to keep; longer clips are cut at the
            end, shorter clips are zero-padded at the end.

    Returns:
        A numpy array of shape ``(128, n_frames, 1)``: 128 Mel bands in dB
        relative to the clip's own maximum, with a trailing channel axis so
        the result can be fed to ``Conv2D``.
    """
    waveform, rate = librosa.load(filepath, sr=16000)

    # Bring every clip to exactly max_length samples
    n_missing = max_length - len(waveform)
    if n_missing < 0:
        waveform = waveform[:max_length]
    else:
        waveform = np.pad(waveform, (0, n_missing))

    # Mel power spectrogram, then convert to decibels
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=rate, n_mels=128)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    # Trailing axis of size 1 is the channel dimension
    return mel_db[..., np.newaxis]


# Smoke-test the preprocessing on a single clip (make sure this file exists
# in the local copy of the dataset)
example_file = os.path.join(data_dir, "yes", "0a7c2a8d_nohash_0.wav")
spectrogram = preprocess_audio(example_file)

# First axis is the 128 Mel bands, last axis is the added channel dimension
print(f"Spectrogram shape: {spectrogram.shape}")



Commands: ['right' 'eight' 'cat' 'tree' 'backward' 'learn' 'bed' 'happy' 'go' 'dog'
 'no' 'wow' 'follow' 'nine' 'left' 'stop' 'three' '_background_noise_'
 'sheila' 'one' 'bird' 'zero' 'seven' 'up' 'visual' 'marvin' 'two' 'house'
 'down' 'six' 'yes' 'on' 'five' 'forward' 'off' 'four']
Spectrogram shape: (128, 32, 1)


In [18]:
from sklearn.preprocessing import LabelEncoder

# Convert string labels to integers using LabelEncoder.
# NOTE(review): `train_labels` and `val_labels` are created by the train/validation
# split cell that appears further down in this notebook (execution count In [5]),
# so on a fresh kernel that cell has to be run before this one.
# The fitted `label_encoder` is reused later by `preprocess` and to size the
# model's output layer.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)


In [5]:
from sklearn.model_selection import train_test_split
import random
def load_dataset(data_dir):
    """Collect every ``.wav`` file under ``data_dir`` together with its label.

    The label of a file is the name of the sub-directory that contains it.
    Entries of ``data_dir`` that are not directories (e.g. ``.DS_Store``) and
    files without a ``.wav`` suffix are ignored.

    Args:
        data_dir: Root directory holding one sub-directory per label.

    Returns:
        A ``(dataset, labels)`` tuple of two equally long lists: the file
        paths and the matching label names, ordered by label and then by
        file name.
    """
    dataset = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and therefore any seeded split of it) reproducible across machines
    for label in sorted(os.listdir(data_dir)):
        label_dir = os.path.join(data_dir, label)

        # Skip files like .DS_Store and only process directories
        if not os.path.isdir(label_dir):
            continue

        for file in sorted(os.listdir(label_dir)):
            if file.endswith('.wav'):
                dataset.append(os.path.join(label_dir, file))
                labels.append(label)

    return dataset, labels


# Gather every (file path, label) pair found under the dataset directory
dataset, labels = load_dataset(data_dir)

# Hold out 20% of the files for validation; stratifying on the labels keeps the
# class proportions in both splits, and the fixed random_state seeds the shuffle
train_files, val_files, train_labels, val_labels = train_test_split(
    dataset,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

print(f"Training samples: {len(train_files)}")
print(f"Validation samples: {len(val_files)}")


Training samples: 84668
Validation samples: 21167


In [19]:
def decode_audio(filepath):
    """Read a WAV file and return its samples with the last axis squeezed out.

    The sample rate returned by the decoder is discarded.
    NOTE(review): squeezing the last axis presumably assumes mono audio — confirm.
    """
    waveform, _sample_rate = tf.audio.decode_wav(tf.io.read_file(filepath))
    return tf.squeeze(waveform, axis=-1)

def get_spectrogram(waveform):
    """Return the magnitude of the short-time Fourier transform of ``waveform``.

    Uses a frame length of 255 samples and a step of 128 samples.
    """
    stft = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    # Keep only the magnitude of the complex STFT values
    return tf.abs(stft)

def get_label(file_path):
    """Return the second-to-last path component, i.e. the name of the file's parent directory."""
    return tf.strings.split(file_path, os.path.sep)[-2]

def preprocess(file_path, label):
    """Build one training example.

    Args:
        file_path: Path of the audio file.
        label: String label of that file.

    Returns:
        ``(spectrogram, label_encoded)`` — the output of ``preprocess_audio``
        for the file and the integer id the module-level ``label_encoder``
        assigns to ``label``.
    """
    features = preprocess_audio(file_path)
    # transform() works on sequences, so wrap the single label and unpack the single result
    (label_id,) = label_encoder.transform([label])
    return features, label_id

def _build_arrays(files, labels):
    """Preprocess one split and stack it into ``(spectrograms, encoded_labels)`` numpy arrays."""
    pairs = [preprocess(file, label) for file, label in zip(files, labels)]
    # The per-sample list goes out of scope on return, so only the stacked
    # arrays stay alive before the next split is processed
    return np.array([x[0] for x in pairs]), np.array([x[1] for x in pairs])


# Apply the preprocessing to all the training and validation files; each result
# is a (spectrograms, encoded_labels) tuple of numpy arrays
train_ds = _build_arrays(train_files, train_labels)
val_ds = _build_arrays(val_files, val_labels)



In [23]:
# Small CNN classifier over the Mel-spectrogram "images".
# The input shape has to match what preprocess_audio produced for the example
# clip: 128 Mel bands x 32 time steps x 1 channel.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(128, 32, 1)),  # Adjusted to 32 time steps
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')  # One output per encoded label
])

# Labels are integer class ids (from label_encoder), hence the sparse loss
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output
model.summary()


None


In [24]:
# Fit for 10 epochs; train_ds and val_ds are each an (inputs, encoded labels) pair
history = model.fit(
    x=train_ds[0],
    y=train_ds[1],
    epochs=10,
    validation_data=(val_ds[0], val_ds[1]),
)



Epoch 1/10
[1m2646/2646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 68ms/step - accuracy: 0.4813 - loss: 2.1606 - val_accuracy: 0.7558 - val_loss: 0.8346
Epoch 2/10
[1m2646/2646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 61ms/step - accuracy: 0.8139 - loss: 0.6376 - val_accuracy: 0.8009 - val_loss: 0.6830
Epoch 3/10
[1m2646/2646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 62ms/step - accuracy: 0.8555 - loss: 0.4832 - val_accuracy: 0.8144 - val_loss: 0.6590
Epoch 4/10
[1m2646/2646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 60ms/step - accuracy: 0.8761 - loss: 0.4003 - val_accuracy: 0.8255 - val_loss: 0.6214
Epoch 5/10
[1m2646/2646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 61ms/step - accuracy: 0.8993 - loss: 0.3266 - val_accuracy: 0.8173 - val_loss: 0.7117
Epoch 6/10
[1m2646/2646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 60ms/step - accuracy: 0.9106 - loss: 0.2828 - val_accuracy: 0.8243 - val_loss: 0.689