In [1]:
# python library imports
import librosa
import librosa.display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os, sys, re, pickle, glob
from PIL import Image
import pathlib
import csv 
# sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import keras
from keras import layers
from keras.models import Sequential
import warnings
warnings.filterwarnings('ignore')
import IPython.display as ipd

**Visualization of the Audio Files:**
In this section, we load the first few MLEnd audio segments and plot their spectrograms to get a better understanding of the samples.

In [None]:
# Plot a spectrogram for each of the first five training clips and save it as a PNG.
cmap = plt.get_cmap('inferno')

# glob returns paths in arbitrary order; sorting makes "the first five" reproducible.
files = sorted(glob.glob('/content/drive/MyDrive/QMUL/ML/Project/Data/MLEnd/training/*/*.wav'))

# Make sure the image directory exists before saving into it.
img_dir = '/content/drive/MyDrive/QMUL/ML/Project/Data/MLEnd/img_data'
os.makedirs(img_dir, exist_ok=True)

# NOTE(review): `intonations` is not used in this cell; kept in case another cell reads it.
intonations = 'neutral bored excited question'.split()
for filename in files[:5]:
    y, sr = librosa.load(filename, mono=True, duration=5)

    # One figure per clip: a single figure created before the loop only applies its
    # size to the first plot once plt.show() has displayed it.
    fig, ax = plt.subplots(figsize=(8, 8))
    # NOTE(review): Fs=2 is kept from the original rather than the clip's sample rate `sr`;
    # the axes are hidden, so revisit this if they are ever re-enabled.
    ax.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB')
    ax.axis('off')

    clip_name = os.path.splitext(os.path.basename(filename))[0]
    fig.savefig(os.path.join(img_dir, f'{clip_name}.png'))
    plt.show()

    # Release the figure explicitly so figures do not accumulate across iterations.
    plt.close(fig)

**Data Preprocessing:**
This section builds the list of column headers for the feature CSV file.

In [2]:
# Column names for the feature CSV: the file name, six summary features,
# twenty MFCC columns (mfcc1..mfcc20) and finally the label.
header = (
    ['filename', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
    + [f'mfcc{n}' for n in range(1, 21)]
    + ['label']
)

print(header)

['filename', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zero_crossing_rate', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4', 'mfcc5', 'mfcc6', 'mfcc7', 'mfcc8', 'mfcc9', 'mfcc10', 'mfcc11', 'mfcc12', 'mfcc13', 'mfcc14', 'mfcc15', 'mfcc16', 'mfcc17', 'mfcc18', 'mfcc19', 'mfcc20', 'label']


**Labels Processing:**
Loading all the labels into a variable to process and feed into our model.

In [4]:
# Load the training label table and display its digit_label column.
labels = pd.read_csv('trainingMLEnd.csv')
labels.loc[:, 'digit_label']

0         4
1         2
2        70
3         2
4         4
         ..
19995    90
19996    10
19997    90
19998    19
19999    20
Name: digit_label, Length: 20000, dtype: int64

In [5]:
# Display the column holding the audio file name of each labelled clip.
labels.loc[:, 'File ID']

0        0000000.wav
1        0000001.wav
2        0000002.wav
3        0000003.wav
4        0000004.wav
            ...     
19995    0019995.wav
19996    0019996.wav
19997    0019997.wav
19998    0019998.wav
19999    0019999.wav
Name: File ID, Length: 20000, dtype: object

**Feature Extraction:**
Extracting features from the first 5000 audio files (keeping only the clips whose label is a single digit, 0–9) and saving them into a CSV file, which in turn will be used as the input of our model.

In [None]:
# Extract one row of summary audio features per single-digit clip and write them to a CSV file.
new_dataset_path = '/content/drive/MyDrive/QMUL/ML/Project/Data/MLEnd/new_dataset.csv'
training_dir = '/content/drive/MyDrive/QMUL/ML/Project/Data/MLEnd/training/Training'

# Only the first 5000 entries of the label table are considered.
file_ids = labels['File ID'][:5000]
digit_labels = labels['digit_label'][:5000]

# Open the CSV once for the header and all rows instead of reopening it for every clip.
with open(new_dataset_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    # Pair each file name with its label directly rather than via a manual counter.
    for file_id, digit_label in zip(file_ids, digit_labels):
        # Keep single-digit labels only. Checking before loading means that
        # skipped clips are never decoded.
        if digit_label > 9:
            continue

        filename = f'{training_dir}/{file_id}'
        y, sr = librosa.load(filename, mono=True, duration=30)

        rmse = librosa.feature.rms(y=y)[0]
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)

        # Row layout follows `header`: file name, the mean of each summary feature,
        # the mean of each MFCC row, then the label. Values are formatted with
        # f-strings so the text written to the CSV is unchanged.
        row = [os.path.basename(filename)]
        row += [f'{np.mean(feature)}' for feature in (chroma_stft, rmse, spec_cent, spec_bw, rolloff, zcr)]
        row += [f'{np.mean(coefficients)}' for coefficients in mfcc]
        row.append(f'{digit_label}')
        writer.writerow(row)

**Visualizing Feature Data:**
Here we display the features that were saved into the CSV file.

In [None]:
# Read the feature CSV at new_dataset_path back into a DataFrame and display it.
data = pd.read_csv(new_dataset_path)
data

**Feature Selection and Preparing Training & Test Data:**
In this section we remove the unnecessary columns from the feature data, encode the labels, scale the features, and split the data into training and test sets.

In [None]:
# Prepare model inputs: drop the file name, encode the labels, split, then scale.
RANDOM_STATE = 42  # fixed seed so the train/test split is reproducible

# Dropping unnecessary columns. errors='ignore' keeps this cell re-runnable:
# on a second run 'filename' is already gone and a plain drop would raise KeyError.
data = data.drop(columns=['filename'], errors='ignore')

# Encoding the labels (the last column of the feature table).
digit_list = data.iloc[:, -1]
print(digit_list)
encoder = LabelEncoder()
y = encoder.fit_transform(digit_list)

# Dividing the data into training and testing sets before scaling.
features = np.array(data.iloc[:, :-1], dtype=float)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=RANDOM_STATE
)

# Scaling the feature columns. The scaler is fitted on the training rows only, so
# no statistics from the test rows leak into the features the model is trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled version of the full feature matrix, kept under its original name.
X = scaler.transform(features)

**Visualizing the Features**

In [None]:
# Show every column except the last one.
data.iloc[:, :-1]

In [None]:
# First row of all columns except the last, converted to a float array.
np.array(data.iloc[:, :-1], dtype = float)[0]

In [None]:
# Inspect the training feature matrix returned by train_test_split.
X_train

**Checking the label data**

In [None]:
# Inspect the label array produced by encoder.fit_transform.
y

**Building the Model:**
Here we build our model: an artificial neural network (ANN) made of fully connected layers with ReLU activations, followed by a softmax output layer.

In [None]:
# Fully connected classifier: three ReLU hidden layers (256, 128 and 64 units)
# followed by a 10-unit softmax output layer.
n_features = X_train.shape[1]
model = Sequential([
    layers.Dense(256, activation='relu', input_shape=(n_features,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

# Labels are integer class indices, hence the sparse categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

**Model Summary**

In [None]:
# Print the summary of the model built above.
model.summary()

**Model Fitting and Classification:**
In this phase, we fit the model on the training data for 150 epochs with a batch size of 128.

In [None]:
# Train on the training split; the value returned by fit() is kept in `classifier`.
classifier = model.fit(x=X_train, y=y_train, batch_size=128, epochs=150)

**Checking Accuracy Data**

In [None]:
# Evaluate on the training split; index 1 of the result is the accuracy metric
# requested in model.compile (index 0 is the loss).
train_accuracy = model.evaluate(x=X_train, y=y_train, verbose=0)
print(train_accuracy[1])

In [None]:
# Evaluate on the held-out test split; index 1 of the result is the accuracy metric
# requested in model.compile (index 0 is the loss).
test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print(test_accuracy[1])

In [None]:
# Inspect the test-set feature row at index 1.
X_test[1]

**Visualization with Confusion Matrix**

In [None]:
# NOTE(review): `neighbors` is not used in this cell; the import is kept in case another cell relies on it.
from sklearn import neighbors
from sklearn.metrics import confusion_matrix
import seaborn as sns

# model.predict returns one row of softmax scores per sample. confusion_matrix needs
# a single class index per sample, so take the highest-scoring class of each row.
train_probabilities = model.predict(X_train)
yt_p = np.argmax(train_probabilities, axis=1)
print(y_train)
print(yt_p)
train_confusion_matrix = confusion_matrix(y_true=y_train, y_pred=yt_p)
print('Training confusion matrix:\n {}\n'.format(train_confusion_matrix))

# Confusion matrix heatmap: reuse the matrix computed above instead of recomputing it.
y_pred = yt_p
cf_matrix = train_confusion_matrix
# fmt='d' shows the cell counts as plain integers rather than the default '.2g' notation.
sns.heatmap(cf_matrix, annot=True, fmt='d')

**Testing the Model:**
We chose a random audio file from the MLEnd audio dataset and verified whether our model could predict the correct number.

In [None]:
# Compute the feature vector of a single audio file, in the same column order as the feature CSV.
filename = '0008039.wav' # digit file
filename = f'/content/drive/MyDrive/QMUL/ML/Project/Data/MLEnd/training/Training/{filename}'

# Dedicated names are used for the waveform and sample rate so that `y`
# (the encoded label array from the split cell) is not overwritten.
test_signal, test_sr = librosa.load(filename, mono=True, duration=30)

rmse = librosa.feature.rms(y=test_signal)[0]
chroma_stft = librosa.feature.chroma_stft(y=test_signal, sr=test_sr)
spec_cent = librosa.feature.spectral_centroid(y=test_signal, sr=test_sr)
spec_bw = librosa.feature.spectral_bandwidth(y=test_signal, sr=test_sr)
rolloff = librosa.feature.spectral_rolloff(y=test_signal, sr=test_sr)
zcr = librosa.feature.zero_crossing_rate(test_signal)
mfcc = librosa.feature.mfcc(y=test_signal, sr=test_sr)

# Each value goes through the same f-string formatting used when writing the feature
# CSV, so it is parsed from the same text representation as the training data.
feature_strings = [f'{np.mean(feature)}' for feature in (chroma_stft, rmse, spec_cent, spec_bw, rolloff, zcr)]
feature_strings += [f'{np.mean(coefficients)}' for coefficients in mfcc]

# NOTE: `data` is rebound from the feature DataFrame to this 1-D array because the
# prediction cell reads the vector under that name.
data = np.array(feature_strings, dtype = float)
print(data)

**Reshaping the Test Input and Pulling the Prediction Values**

In [None]:
# Turn the single feature vector into a batch of one sample.
data_reshaped = data.reshape(1,-1)

# The model was trained on features standardized by `scaler`, so the same fitted
# scaler must be applied here; feeding raw feature values gives meaningless predictions.
data_scaled = scaler.transform(data_reshaped)

# Sequential.predict_classes is no longer available in current Keras; taking the argmax
# of the softmax output yields the same class index on every version.
predicted_label = np.argmax(model.predict(data_scaled), axis=-1)
print(predicted_label)

In [None]:
# Map the predicted class index back to the original label with the fitted encoder, then display it.
prediction_class = encoder.inverse_transform(predicted_label) 
prediction_class

**Conclusion:**

In this solution, we used the MLEnd dataset: we split it, extracted features, and trained the model. Our model was an Artificial Neural Network, which gave us significant results. Finally, we checked with a test sample that the model can predict the correct numeral from a given MLEnd audio segment.