In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras import models, layers

In [4]:
!pip install pydub


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [5]:
# Load the speech_commands dataset builder
builder = tfds.builder('speech_commands')

# Download the dataset
builder.download_and_prepare()
train_dataset = builder.as_dataset(split='train')
num_classes = builder.info.features['label'].num_classes


Downloading and preparing dataset 2.37 GiB (download: 2.37 GiB, generated: 8.17 GiB, total: 10.53 GiB) to /root/tensorflow_datasets/speech_commands/0.0.3...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/85511 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/0.0.3.incompleteA1F1M3/speech_commands-train.tfrecord*...:…

Generating validation examples...:   0%|          | 0/10102 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/0.0.3.incompleteA1F1M3/speech_commands-validation.tfrecord…

Generating test examples...:   0%|          | 0/4890 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/0.0.3.incompleteA1F1M3/speech_commands-test.tfrecord*...: …



Dataset speech_commands downloaded and prepared to /root/tensorflow_datasets/speech_commands/0.0.3. Subsequent calls will reuse this data.


In [6]:
def preprocess_audio(audio_data, target_sample_rate=16000, n_mfcc=13, max_frames=50):
    audio_array = audio_data.numpy().astype(np.float32) / 32767.0

    # Extract MFCC features
    mfccs = librosa.feature.mfcc(y=audio_array, sr=target_sample_rate, n_mfcc=n_mfcc)

    # Pad or truncate the features to a fixed length
    if mfccs.shape[1] < max_frames:
        mfccs_padded = np.pad(mfccs, ((0, 0), (0, max_frames - mfccs.shape[1])))
    else:
        mfccs_padded = mfccs[:, :max_frames]

    return mfccs_padded


In [7]:
X_train = []
y_train = []

for example in train_dataset:
    audio_data = example['audio']
    label = example['label']

    # Preprocess the audio data
    processed_audio = preprocess_audio(audio_data)


    X_train.append(processed_audio)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [9]:
# Build a deep FFNN model
def build_deep_ffnn_model(input_shape, num_classes):
    model = models.Sequential()
    model.add(layers.Flatten(input_shape=input_shape))
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Instantiate the model
input_shape = X_train.shape[1:]
model = build_deep_ffnn_model(input_shape, num_classes)

# Train the model
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_val, y_val))

# Save the model with the minimum validation loss
model.save('classification_model.h5')

# Load the saved model
loaded_model = tf.keras.models.load_model('classification_model.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


In [11]:
def preprocess_and_predict(audio_file_path):
    # Load the audio file
    audio_data, sample_rate = librosa.load(audio_file_path, sr=None)

    # Preprocess the audio data
    processed_audio = preprocess_audio(tf.constant(audio_data), target_sample_rate=sample_rate)

    # Expand dimensions to match the model input shape
    processed_audio = np.expand_dims(processed_audio, axis=0)

    # Make predictions
    predictions = loaded_model.predict(processed_audio)

    # Convert predictions to class label
    predicted_class = np.argmax(predictions)

    return predicted_class


audio_file_path = '/content/go record.wav'
predicted_class = preprocess_and_predict(audio_file_path)

print("Predicted Class:", predicted_class)

Predicted Class: 11


In [13]:
# Evaluate the model on the validation set
eval_result = loaded_model.evaluate(X_val, y_val)

accuracy = eval_result[1]

print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.6480149626731873
