Here, an ANN model is trained on mel spectrograms computed with the librosa library.
A mel spectrogram captures both the frequency and the time structure of the signal.

- import the necessary packages

In [1]:
import glob
import os
import pathlib
from pathlib import Path

import librosa
import librosa.display
import numpy as np
from matplotlib import pyplot as plt
from pydub import AudioSegment

%matplotlib inline

### Place your dataset in the datas directory in the following format:

- Copy the data from dataset directory
- The dataset directory contains audio clips split into 3-second and 5-second intervals. Choose the data for the interval you are interested in.
- E.g. if you want to train the model on 3 seconds interval data, copy directories from 'data_3_sec' and place it in datas folder in following format:

datas

    |-IVR
        |-ivr.wav
        |-ivr1.wav
        |-ivr2.wav
        |.........

    |-Music
        |-music.wav
        |-music1.wav
        |-music2.wav
        |.........

    |-Speech
        |-speech.wav
        |-speech2.wav
        |-speech3.wav
        |.........

- build the absolute path to the data directory (note: the code below looks for a folder named `dataset` next to the notebook)

In [2]:
# Use the notebook's working directory as the project root.
# Path.cwd() says this directly; Path(__name__) only gave the same result
# because the parent of a bare relative name is '.'.
base_dir = pathlib.Path.cwd()
# NOTE: despite its name, this is the root folder of the audio dataset
# (one sub-folder per class). The name is kept because later cells use it.
pdf_folder_path = os.path.join(base_dir, 'dataset')
pdf_folder_path

'/home/anush/Desktop/ac_new/dataset'

- create directory to store the spectrogram images

In [3]:
# Spectrogram images are written under '<base_dir>/audio-data-images'.
OUTPUT_DIR = base_dir
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(os.path.join(OUTPUT_DIR, 'audio-data-images'), exist_ok=True)

- convert the mp3 files to wav, if any

In [4]:
# Convert every .mp3 clip inside each class folder to .wav and delete the original.
for class_name in sorted(os.listdir(pdf_folder_path)):
    class_dir = os.path.join(pdf_folder_path, class_name)
    # Stray files in the dataset root are not class folders.
    if not os.path.isdir(class_dir):
        continue
    print(class_name)
    for audio_path in sorted(glob.glob(os.path.join(class_dir, '*'))):
        # Only real MP3 files are converted. Previously every non-'.wav' entry
        # (including '.WAV' clips or unrelated files) was decoded as MP3 and
        # then removed from disk.
        if not audio_path.lower().endswith('.mp3'):
            continue
        output_file = f'{os.path.splitext(audio_path)[0]}.wav'
        sound = AudioSegment.from_mp3(audio_path)
        # export() hands back the open output file; close it so handles do not
        # pile up across thousands of clips.
        sound.export(output_file, format="wav").close()
        # Reached only when the export above did not raise.
        os.remove(audio_path)

IVR
Music
Silence
Speech


- convert the audio files to mel-spectrogram images and save them to the audio-data-images directory

In [None]:
SAMPLE_RATE = 8000          # Hz; every clip is resampled to this rate on load
MAX_FILES_PER_CLASS = 3400  # upper bound on the clips rendered per class

# Render one mel-spectrogram PNG per audio clip, mirroring the class folders
# of the dataset under 'audio-data-images'. Existing images are not redrawn,
# so the cell can be interrupted and resumed.
for i in sorted(os.listdir(pdf_folder_path)):
    class_dir = os.path.join(OUTPUT_DIR, 'dataset', i)
    if not os.path.isdir(class_dir):
        continue

    # Create the class output folder once, not once per file.
    image_dir = os.path.join(OUTPUT_DIR, 'audio-data-images', i)
    os.makedirs(image_dir, exist_ok=True)

    # Sorted so the same clips are selected on every run when the cap applies.
    audio_files = sorted(glob.glob(os.path.join(class_dir, '*')))[:MAX_FILES_PER_CLASS]
    for audio_file in audio_files:
        image_file = os.path.join(image_dir, f'{Path(audio_file).stem}.png')
        if os.path.exists(image_file):
            continue

        # Load before creating the figure so a failed load leaves no open figure.
        # Named `signal` rather than `y`, which later cells use for the label list.
        signal, samp = librosa.load(audio_file, sr=SAMPLE_RATE)
        ms = librosa.feature.melspectrogram(y=signal, sr=samp)
        log_ms = librosa.power_to_db(ms, ref=np.max)

        fig = plt.figure()
        try:
            ax = fig.add_subplot(1, 1, 1)
            # Let the spectrogram fill the whole canvas (no margins).
            fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
            librosa.display.specshow(log_ms, sr=samp, ax=ax)
            fig.savefig(image_file)
        finally:
            # Always release the figure; thousands of open figures exhaust memory.
            plt.close(fig)

In [None]:
# Display the installed matplotlib version.
import matplotlib
matplotlib.__version__

- load the images from a path as arrays and append them to one list, with their labels in a second list
- also show images after loading

In [None]:
# from keras.preprocessing import image
# from tensorflow.keras.preprocessing.image import img_to_array
import keras.utils as image
def load_images_from_path(path, label, target_size=(224, 224)):
    """Load every image in a directory and pair each one with ``label``.

    Args:
        path: Directory holding the spectrogram images of one class.
        label: Class index assigned to every image found in ``path``.
        target_size: ``(height, width)`` each image is resized to while loading.

    Returns:
        A tuple ``(images, labels)`` of two equally long lists: the image
        arrays produced by ``img_to_array`` and one copy of ``label`` per image.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    images = []
    labels = []

    # Sorted so the sample order, and with it the seeded train/test split,
    # is identical on every run (os.listdir order is arbitrary).
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        # Skip sub-directories (e.g. .ipynb_checkpoints) and non-image files,
        # which would otherwise make load_img raise.
        if not os.path.isfile(file_path) or not file.lower().endswith(image_extensions):
            continue
        images.append(image.img_to_array(image.load_img(file_path, target_size=target_size)))
        labels.append(label)

    return images, labels

def show_images(images):
    """Show up to the first eight images of ``images`` in a single row.

    Args:
        images: Sequence of image arrays; values are divided by 255 before
            display, so they are assumed to be in the 0-255 range.
    """
    fig, axes = plt.subplots(1, 8, figsize=(20, 20), subplot_kw={'xticks': [], 'yticks': []})

    for i, ax in enumerate(axes.flat):
        if i < len(images):
            ax.imshow(images[i] / 255)
        else:
            # Fewer than eight images: hide the unused panel instead of
            # failing with IndexError.
            ax.set_visible(False)
        
# Accumulators for the whole dataset: x collects the image arrays and
# y the matching class indices. Re-running this resets both to empty.
x, y = [], []

- pass the ivr dataset path and get the image array

In [None]:
# IVR spectrograms -> class index 0.
# (The path used to be stored in a variable called `music_path`.)
ivr_path = os.path.join(base_dir, 'audio-data-images', 'IVR')

images, labels = load_images_from_path(ivr_path, 0)
show_images(images)

# NOTE: re-running this cell appends the same images again; reset x and y first.
x += images
y += labels

- pass the music dataset path and get the image array

In [None]:
# Music spectrograms -> class index 1.
# (The path used to be stored in a variable called `Speech_path`.)
music_path = os.path.join(base_dir, 'audio-data-images', 'Music')
images, labels = load_images_from_path(music_path, 1)
show_images(images)

# NOTE: re-running this cell appends the same images again; reset x and y first.
x += images
y += labels

- pass the silence dataset path and get the image array

In [None]:
# Silence spectrograms are class index 2.
Silence_path = os.path.join(base_dir, 'audio-data-images', 'Silence')
images, labels = load_images_from_path(path=Silence_path, label=2)
show_images(images)

# Append this class to the dataset-wide accumulators.
x.extend(images)
y.extend(labels)

In [None]:
# Speech spectrograms are class index 3.
Speech_path = os.path.join(base_dir, 'audio-data-images', 'Speech')
images, labels = load_images_from_path(path=Speech_path, label=3)
show_images(images)

# Append this class to the dataset-wide accumulators.
x.extend(images)
y.extend(labels)

- train/test split at an 80:20 ratio (`test_size=0.2`) and one-hot encoding of the labels

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=True)

# Pixel scaling is left to the feature-extraction cell (preprocess_input).
# The former `/ 255` copies were overwritten there before being used and only
# cost a second full-size copy of the image data.

# Fix the one-hot width from the full label list. Letting to_categorical infer
# it per split gives train and test different widths whenever the highest
# class index is missing from one of them.
num_classes = max(y) + 1
y_train_encoded = to_categorical(y_train, num_classes=num_classes)
y_test_encoded = to_categorical(y_test, num_classes=num_classes)

In [None]:
# Inspect the one-hot encoded test labels.
y_test_encoded

- Keras is used to preprocess the images
- MobileNetV2 is used to extract features from them

In [None]:
# base_model = VGG19( include_top=False, weights="imagenet",input_shape=(224, 224, 3)) #half size model to MV2

In [None]:
from tensorflow.keras.applications import MobileNetV2
# Take preprocess_input from the module that matches the model in use
# (it was previously imported from the MobileNet v1 module).
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

# ImageNet-pretrained MobileNetV2 without its classification head, used only
# to turn each spectrogram image into a feature map.
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# np.array(...) builds a fresh array, so the raw image lists stay untouched.
x_train_norm = preprocess_input(np.array(x_train))
x_test_norm = preprocess_input(np.array(x_test))

# The extracted features are the inputs of the dense classifier trained below.
train_features = base_model.predict(x_train_norm)
test_features = base_model.predict(x_test_norm)

### model creation & training

- sequential model is used with adam optimizer and categorical cross entropy

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Flatten, Dense

# One output unit per class, taken from the one-hot label width. The layer was
# hard-coded to 3 units while the labels cover four classes (IVR, Music,
# Silence, Speech), which makes fit() fail with a shape mismatch.
num_classes = y_train_encoded.shape[1]

# Small dense classifier on top of the pre-computed MobileNetV2 features.
model = Sequential()
model.add(Flatten(input_shape=train_features.shape[1:]))
model.add(Dense(1024, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

- model is trained with batch size of 10 and 10 epochs

In [None]:
# Train the classifier on the extracted features. The held-out test split is
# also used as validation data, so the validation curve is the test curve.
hist = model.fit(
    x=train_features,
    y=y_train_encoded,
    batch_size=10,
    epochs=10,
    validation_data=(test_features, y_test_encoded),
)

In [None]:
# Save the trained dense classifier under a relative path (current working directory).
# NOTE: only `model` is saved; it consumes MobileNetV2 features, so inference
# has to run the images through `base_model` first.
model.save('ann_mobilenetv2_1sec_normalized.h5')

### model evaluation

- plot training and validation accuracy

In [None]:
# Per-epoch accuracy recorded by model.fit().
acc = hist.history['accuracy']
val_acc = hist.history['val_accuracy']
epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

fig, ax = plt.subplots()
ax.plot(epochs, acc, '-', label='Training Accuracy')
ax.plot(epochs, val_acc, ':', label='Validation Accuracy')
ax.set_title('Training and Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
# The cell used to end with plt.plot(), which draws nothing; show() renders
# the figure and keeps the cell output free of a stray repr.
plt.show()

- plot confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
sns.set()

# Index in this list == class index used when the images were loaded.
class_labels = ['IVR', 'Music', 'Silence', 'Speech']

y_predicted = model.predict(test_features)
# Pin the label set so the matrix always has one row and column per class.
# Without `labels`, a class absent from both the test labels and the
# predictions is dropped and the tick labels no longer line up.
mat = confusion_matrix(
    y_test_encoded.argmax(axis=1),
    y_predicted.argmax(axis=1),
    labels=list(range(len(class_labels))),
)

sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False, cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)

plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.show()

- print accuracy, recall, F1 score, precision

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 plus overall accuracy. Rows are named
# after the classes and pinned to the full label set, so every class appears
# under its name even if it is missing from the test split.
report = classification_report(
    y_test_encoded.argmax(axis=1),
    y_predicted.argmax(axis=1),
    labels=list(range(len(class_labels))),
    target_names=class_labels,
)
print(report)