### File:           Project_Model  
  
### Authors:        Brooke McWilliams, James Birch  
  
### Date Created:   11/19/2023  
  
### Last Modified:  12/04/2023  
  
### Description:    Extract features from audio files using the librosa library and train CNN models using the TensorFlow and Keras libraries  
<br>
<br>
<br>


### Audio Preprocessing

In [128]:
import os
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

Function to load the WAV audio file data using librosa 

In [129]:
def get_data(filepath):
    """Load one audio file with librosa.

    Args:
        filepath: Path of the audio file to read.

    Returns:
        The ``(data, sampleRate)`` pair produced by ``librosa.load`` when it
        is called with its default arguments.
    """
    loaded = librosa.load(filepath)
    return loaded

Assign variables that will be used to store the data for modeling  
Go through the Crema directory and append each file's data to these variables  

In [130]:
# Directory that holds the .wav recordings.
path = "./Crema Dataset/"

# Accumulators filled in parallel: index i in every list refers to the same file.
wav_data_mfcc_norm = []   # normalized MFCC feature vectors (filled later)
wav_data_rms_norm = []    # normalized RMS feature vectors (filled later)
labelList = []            # label parsed from each file name
wav_data = []             # raw audio samples per file
fileName = []             # file names, kept for tracking
sampleR = []              # sample rate reported for each file

# os.listdir() returns entries in arbitrary order. Sorting makes the row order
# (and therefore the seeded train/test split further down) reproducible.
for file in sorted(os.listdir(path)):
    if file.endswith(".wav"):
        file_path = os.path.join(path, file)
        # Get audio data from each data file
        record, sr = get_data(file_path)
        wav_data.append(record)
        sampleR.append(sr)

        # Label is the third underscore-separated field of the file name
        # (presumably the emotion code in the CREMA naming scheme - confirm).
        labelList.append(file.split('_')[2])

        # File names list for tracking
        fileName.append(file)

Find the maximum audio length and zero-pad every clip to it so that all feature vectors have the same size 

In [131]:
# Longest and shortest clip lengths, in samples. The defaults reproduce the
# values an empty wav_data list would have produced (0 and +inf).
clip_lengths = [len(clip) for clip in wav_data]
maxL = max(clip_lengths, default=0)
minL = min(clip_lengths, default=float('inf'))

# Right-pad every clip with zeros up to the longest length so that all
# signals end up with the same number of samples.
padded_data = [
    np.pad(clip, (0, maxL - len(clip)), mode="constant")
    for clip in wav_data
]

Extract the features from the read audio file data and normalize them

In [132]:
# Build exactly one normalized MFCC vector and one normalized RMS vector per
# padded clip, so both feature lists stay aligned with fileName and labelList.
for audio, rate in zip(padded_data, sampleR):
    # 13 MFCCs per frame, flattened into a single 1-D vector, then
    # standardized with the clip's own mean and standard deviation.
    wav_data_mfcc = librosa.feature.mfcc(y=audio, sr=rate, n_mfcc=13).flatten()
    wav_data_mfcc_norm.append((wav_data_mfcc - np.mean(wav_data_mfcc)) / np.std(wav_data_mfcc))

    # Frame-wise RMS computed with a hop of 200 samples, flattened and
    # standardized the same way.
    wav_data_rms = librosa.feature.rms(y=audio, hop_length=200).flatten()
    wav_data_rms_norm.append((wav_data_rms - np.mean(wav_data_rms)) / np.std(wav_data_rms))


Create dataframes to help work with the features 

In [133]:
# One row per audio file, one column per feature value.
wave_mfcc_dataframe = pd.DataFrame(wav_data_mfcc_norm)   # MFCC features
wave_rms_dataframe = pd.DataFrame(wav_data_rms_norm)     # RMS features

# Prepend the same two metadata columns to both tables: the file name at
# position 0 and the label at position 1.
for feature_frame in (wave_mfcc_dataframe, wave_rms_dataframe):
    feature_frame.insert(0, "ID/File", fileName)
    feature_frame.insert(1, "Labels", labelList)

# CNN Modeling

In [134]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Conv1D, Activation, BatchNormalization, Dropout, MaxPooling1D, Flatten, Dense
from keras.optimizers import Adam, RMSprop
from keras.optimizers.schedules import ExponentialDecay
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.regularizers import l1, l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Build the CNN model for training  

In [214]:
def build_model(X_train, y_train, X_test, y_test, num_label):
    """Create and compile the 1-D CNN classifier.

    Args:
        X_train: Training features; only ``X_train.shape[1]`` is read, to
            size the input layer as ``(n_features, 1)``.
        y_train: Accepted for call-site symmetry; not used in the body.
        X_test: Accepted for call-site symmetry; not used in the body.
        y_test: Accepted for call-site symmetry; not used in the body.
        num_label: Number of units in the final softmax layer.

    Returns:
        The compiled (untrained) ``Sequential`` model. The model summary is
        printed as a side effect.
    """
    n_features = X_train.shape[1]

    # Settings shared by every convolution / pooling layer in the stack.
    conv_args = dict(kernel_size=5, padding='same', activation='relu')
    pool_args = dict(pool_size=2, padding='same', strides=2)

    model = Sequential([
        Conv1D(filters=256, input_shape=(n_features, 1), **conv_args),

        Conv1D(filters=256, **conv_args),
        MaxPooling1D(**pool_args),

        Conv1D(filters=128, **conv_args),
        MaxPooling1D(**pool_args),
        Dropout(0.2),

        Conv1D(filters=64, **conv_args),
        MaxPooling1D(**pool_args),

        # Classification head.
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(num_label, activation='softmax'),
    ])

    model.summary()

    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.0001),
        metrics=['accuracy'],
    )

    return model

Split the data into training and testing and encode the labels

## MFCC

In [212]:
# Features are every column after the two metadata columns
# ("ID/File" at position 0, "Labels" at position 1).
X1 = wave_mfcc_dataframe.iloc[:, 2:]
y = wave_mfcc_dataframe['Labels']

# Map the string labels to integer ids, then one-hot encode them.
# The class count comes from the fitted encoder instead of a hard-coded 6,
# so the one-hot width always matches the softmax width given to build_model.
encoder = LabelEncoder()
y = encoder.fit_transform(y)
num_label = len(encoder.classes_)
y = to_categorical(y, num_classes=num_label)

# 80/20 split with a fixed seed.
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)

print(f"Shape of X1_train: {X1_train.shape}\n")
print(f"Shape of y_train: {y_train.shape}\n")
print(f"Shape of X1_test: {X1_test.shape}\n")
print(f"Shape of y_test: {y_test.shape}\n")
print(f"Number of labels: {num_label}\n")

Shape of X1_train: (5953, 2808)

Shape of y_train: (5953, 6)

Shape of X1_test: (1489, 2808)

Shape of y_test: (1489, 6)

Number of labels: 6



Send training data to model function and return compiled CNN model for MFCC  


In [None]:
# Build the CNN sized for the MFCC feature vectors.
model1 = build_model(X1_train, y_train, X1_test, y_test, num_label)

# Train for 100 epochs, reporting metrics on the held-out split after each one.
model1.fit(
    X1_train,
    y_train,
    epochs=100,
    batch_size=64,
    validation_data=(X1_test, y_test),
)

# evaluate() returns the loss followed by the compiled metrics, so index 1
# is the accuracy.
acc1 = model1.evaluate(X1_test, y_test)

print(f"Model Accuracy: {acc1[1]:.4f}\n")

## RMS

In [215]:
# RMS features: every column after the two metadata columns.
X2 = wave_rms_dataframe.iloc[:, 2:]

# Reuses the one-hot labels `y` produced in the MFCC section and the same
# seed and test fraction for the split.
X2_train, X2_test, y_train, y_test = train_test_split(
    X2,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Shape of X2_train: {X2_train.shape}\n")
print(f"Shape of y_train: {y_train.shape}\n")
print(f"Shape of X2_test: {X2_test.shape}\n")
print(f"Shape of y_test: {y_test.shape}\n")

Shape of X2_train: (5953, 552)

Shape of y_train: (5953, 6)

Shape of X2_test: (1489, 552)

Shape of y_test: (1489, 6)



Send training data to model function and return compiled CNN model for RMS  


In [216]:
# Build the CNN sized for the RMS feature vectors.
model2 = build_model(X2_train, y_train, X2_test, y_test, num_label)

# Train for 100 epochs, reporting metrics on the held-out split after each one.
model2.fit(
    X2_train,
    y_train,
    epochs=100,
    batch_size=64,
    validation_data=(X2_test, y_test),
)

# evaluate() returns the loss followed by the compiled metrics, so index 1
# is the accuracy.
acc2 = model2.evaluate(X2_test, y_test)

print(f"Model Accuracy: {acc2[1]:.4f}\n")

Model: "sequential_42"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_190 (Conv1D)         (None, 552, 256)          1536      
                                                                 
 conv1d_191 (Conv1D)         (None, 552, 256)          327936    
                                                                 
 max_pooling1d_142 (MaxPool  (None, 276, 256)          0         
 ing1D)                                                          
                                                                 
 conv1d_192 (Conv1D)         (None, 276, 128)          163968    
                                                                 
 max_pooling1d_143 (MaxPool  (None, 138, 128)          0         
 ing1D)                                                          
                                                                 
 dropout_48 (Dropout)        (None, 138, 128)        

KeyboardInterrupt: 