Here, an ANN model is trained on mel-spectrogram features computed with the librosa library.
A mel spectrogram captures both the frequency and the time structure of the signal.

### These are all imports required in this script

In [1]:
import librosa
import matplotlib.pyplot as plt
from scipy.io import wavfile as wav
import pandas as pd
import os, pathlib, sys
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from keras.layers import Conv2D, MaxPooling2D,Flatten, Dense, Dropout
from keras.layers import Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
import scipy
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime
import seaborn as sns
from keras.models import load_model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
# from sklearn.metrics import multilabel_confusion_matrix
from IPython import display
# from livelossplot import PlotLossesKeras
import glob
from sklearn.metrics import confusion_matrix
import seaborn as sns

### Place your dataset in the datas directory in the following format:

- Copy the dataset from the data directory.
- The data directory contains clips cut into 2-, 3-, or 5-second intervals. Choose the dataset for the interval of interest.
- E.g. if you want to train the model on 2-second clips, copy the directories from 'data_2_sec' and place them in the datas folder in the following format:

datas

    |-IVR
        |-ivr.wav
        |-ivr1.wav
        |-ivr2.wav
        |.........

    |-Music
        |-music.wav
        |-music1.wav
        |-music2.wav
        |.........

    |-Speech
        |-speech.wav
        |-speech2.wav
        |-speech3.wav
        |.........

### here we get the root directory and join it with the dataset directory

In [None]:
# Resolve the dataset folder relative to the directory the notebook runs from.
# Path.cwd() states this directly; Path(__name__).parent only produced the
# working directory because the module name "__main__" has no directory part.
base_dir = pathlib.Path.cwd()
directory_path = os.path.join(base_dir, 'datas')
directory_path

### feature extraction (extract mel feature)
- first, the subdirectories of datas are iterated over; each subdirectory name is used as the label for its files
- then all files inside that subdirectory are processed
- if a file name ends with '.wav', it is loaded with librosa at a sample rate of 8000 Hz and its mel spectrogram is computed
- the feature and its label are appended to their respective lists; spectrograms whose shape is not (128, 79) are printed and skipped

In [None]:
# Mel-spectrogram extraction: every sub-folder of `directory_path` is one class
# and every '.wav' file inside it is one sample.
SAMPLE_RATE = 8000           # audio is resampled to this rate when loaded
EXPECTED_SHAPE = (128, 79)   # only spectrograms of exactly this shape are kept

feature_data = []
labels = []
# os.listdir returns entries in arbitrary order; sorting keeps the sample order
# (and therefore the seeded train/test split) identical across runs and machines.
for folder_name in sorted(os.listdir(directory_path)):
    folder_path = os.path.join(directory_path, folder_name)
    if not os.path.isdir(folder_path):
        continue

    label = folder_name  # the folder name is the class label
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith('.wav'):
            continue

        file_path = os.path.join(folder_path, file_name)
        audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)
        mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
        if mel_spectrogram.shape != EXPECTED_SHAPE:
            # Report and skip spectrograms of any other shape so that the
            # remaining features can be stacked into a single array.
            print(mel_spectrogram.shape, file_path)
            continue

        feature_data.append(mel_spectrogram)
        labels.append(label)

In [6]:
# Combine the per-file spectrograms into one array with a leading sample axis.
feature_data = np.stack(feature_data, axis=0)

#### Label Encoding

In [8]:
# One-hot encode the class names and keep the result as an integer NumPy array.
one_hot_frame = pd.get_dummies(labels)
label = one_hot_frame.to_numpy().astype(int)

#### train test split

In [9]:
# Hold out 33% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_data,
    label,
    test_size=0.33,
    random_state=42,
)

### model creation & training
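- an ANN (Keras Sequential) model is used
- optimizer: Adam; loss: categorical cross-entropy, since there are more than 2 classes
- an early-stopping callback (patience 3, monitoring validation loss, restoring the best weights) is defined to stop training when validation loss stops improving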

In [12]:
# Fully connected classifier over flattened mel spectrograms.
# The output width is read from the one-hot targets instead of being
# hard-coded, so the network always matches the number of classes in the data.
num_classes = y_train.shape[1]

model = Sequential([
    Flatten(input_shape=X_train[0].shape),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.1),
    Dense(num_classes, activation='softmax'),  # one probability per class
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Stops training after 3 epochs without a val_loss improvement and restores
# the best weights; it only takes effect when passed to model.fit(callbacks=...).
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

- the model is trained for up to 30 epochs with a batch size of 32

In [13]:
# Train for at most 30 epochs. `early_stop` is passed so training actually
# halts (per that callback's configuration) instead of always running all epochs.
# NOTE(review): the held-out test split doubles as validation data here, so the
# later evaluation on X_test is not independent of model selection — consider a
# separate validation split.
hist = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=30,
    callbacks=[early_stop],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30

In [14]:
# Write the trained model to disk under a fixed file name.
model_path = 'ann_statistical_mel.h5'
model.save(model_path)

### model evaluation
- model is evaluated by plotting train, validation accuracy

In [15]:
# Per-epoch accuracy curves recorded by model.fit.
history = hist.history
acc = history['accuracy']
val_acc = history['val_accuracy']
epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

fig, ax = plt.subplots()
ax.plot(epochs, acc, '-', label='Training Accuracy')
ax.plot(epochs, val_acc, ':', label='Validation Accuracy')
ax.set_title('Training and Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
# Render the figure; the previous argument-less plt.plot() drew nothing and
# only left an empty-list repr as the cell output.
plt.show()

- the model predicts labels for X_test, which are then compared with the actual labels
- a confusion matrix is plotted with seaborn to show how the predictions for each class compare with the true classes

In [16]:
sns.set()

# Class probabilities for the held-out samples, one row per sample.
y_predicted = model.predict(X_test)

# pd.get_dummies orders its columns by sorted class name, so the sorted unique
# folder names line up with the argmax indices of the one-hot targets.
class_labels = sorted(set(labels))
assert len(class_labels) == y_test.shape[1], "class names do not match one-hot width"
class_ids = np.arange(len(class_labels))

# Passing `labels` keeps the matrix square and aligned with the tick labels
# even if some class never occurs in the test split or in the predictions.
mat = confusion_matrix(
    y_test.argmax(axis=1),
    y_predicted.argmax(axis=1),
    labels=class_ids,
)

sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False, cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)

plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.show()