In [2]:
import os
import librosa
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical

# Define the base path of your dataset
path_to_your_dataset = 'donateacry_corpus_cleaned_and_updated_data'

# List the subfolders corresponding to each class
subfolders = ['belly_pain', 'burping', 'discomfort', 'hungry', 'tired']

# Initialize lists to store file paths and labels
data = []    # This list will store the paths of audio files
labels = []  # This list will store the corresponding labels (class names)

# Iterate through each subfolder
for label, subfolder in enumerate(subfolders):
    folder_path = os.path.join(path_to_your_dataset, subfolder)
    
    # Iterate through each audio file in the subfolder
    for audio_file in os.listdir(folder_path):
        if audio_file.endswith(".wav"):
            audio_path = os.path.join(folder_path, audio_file)
            data.append(audio_path)
            labels.append(subfolder)

# Convert labels to numbers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the data into training and testing sets (80% train, 20% test)
train_data, test_data, train_labels, test_labels = train_test_split(data, encoded_labels, test_size=0.2, random_state=42)

# Function to extract audio features (spectrograms)
def extract_features(audio_path, target_shape=(128, 128)):
    audio, _ = librosa.load(audio_path, res_type='kaiser_fast')
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=22050, n_mels=128, fmax=8000)
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
    
    # Ensure that all spectrograms have the same shape
    spectrogram_db_fixed = librosa.util.fix_length(spectrogram_db, size=target_shape[1], axis=1)
    
    return spectrogram_db_fixed

# Create training and testing sets
X_train = [extract_features(audio_path) for audio_path in train_data]
X_test = [extract_features(audio_path) for audio_path in test_data]

# Convert lists to NumPy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(train_labels)
y_test = np.array(test_labels)

# Add a dimension to indicate the channel (1 for grayscale)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], X_train.shape[2], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], X_test.shape[2], 1))

# Convert labels to one-hot format
y_train_one_hot = to_categorical(y_train)
y_test_one_hot = to_categorical(y_test)

# Data Augmentation
data_augmentation = tf.keras.Sequential([
    layers.experimental.preprocessing.RandomZoom(0.1),
])

# Create the CNN model
model = models.Sequential()
model.add(data_augmentation)  # Add data augmentation
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2], 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(len(subfolders), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train_one_hot, epochs=10, batch_size=32, validation_data=(X_test, y_test_one_hot))





Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [3]:
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test_one_hot)
print(f"Accuracy on the test set: {test_acc}")

Accuracy on the test set: 0.782608687877655


In [4]:
# Save the trained model using TensorFlow's save_model
model.save("audio_classification_model_cnn")

# Load the saved model
cnn_model = tf.keras.models.load_model("audio_classification_model_cnn")


INFO:tensorflow:Assets written to: audio_classification_model_cnn\assets


INFO:tensorflow:Assets written to: audio_classification_model_cnn\assets








# RNN

In [5]:
# Iterate through each subfolder
for label, subfolder in enumerate(subfolders):
    folder_path = os.path.join(path_to_your_dataset, subfolder)
    
    # Iterate through each audio file in the subfolder
    for audio_file in os.listdir(folder_path):
        if audio_file.endswith(".wav"):
            audio_path = os.path.join(folder_path, audio_file)
            data.append(audio_path)
            labels.append(subfolder)

# Convert labels to numbers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the data into training and testing sets (80% train, 20% test)
train_data, test_data, train_labels, test_labels = train_test_split(data, encoded_labels, test_size=0.2, random_state=42)

# Function to extract audio features (MFCCs)
def extract_features(audio_path):
    audio, _ = librosa.load(audio_path, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=audio, sr=22050, n_mfcc=13)
    return mfccs

# Create training and testing sets
X_train = [extract_features(audio_path) for audio_path in train_data]
X_test = [extract_features(audio_path) for audio_path in test_data]

# Find the minimum shape of the MFCCs
min_shape = min([mfccs.shape[1] for mfccs in X_train + X_test])

# Function to reshape and truncate the MFCCs
def reshape_and_truncate(mfccs, target_shape):
    if mfccs.shape[1] > target_shape:
        return mfccs[:, :target_shape]
    else:
        return np.pad(mfccs, ((0, 0), (0, target_shape - mfccs.shape[1])))

# Apply reshape_and_truncate to all MFCCs
X_train = [reshape_and_truncate(mfccs, min_shape) for mfccs in X_train]
X_test = [reshape_and_truncate(mfccs, min_shape) for mfccs in X_test]

# Convert lists to NumPy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(train_labels)
y_test = np.array(test_labels)

# Reshape the input data for LSTM
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], X_train.shape[2]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], X_test.shape[2]))

# One-hot encode labels
y_train_one_hot = to_categorical(y_train)
y_test_one_hot = to_categorical(y_test)

# Create the LSTM model
model2 = models.Sequential()
model2.add(layers.LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2])))
model2.add(layers.Dense(len(subfolders), activation='softmax'))

# Compile the model
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model2.fit(X_train, y_train_one_hot, epochs=10, validation_data=(X_test, y_test_one_hot))

# Evaluate the model on the test set
test_loss, test_accuracy = model2.evaluate(X_test, y_test_one_hot)
print(f"Test Accuracy: {test_accuracy}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9125683307647705
