- Imports

In [6]:
import matplotlib.pyplot as plt
import pandas as pd
import os, tempfile, librosa, pyaudio
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
from PIL import Image
import soundfile as sf
import tensorflow as tf
import pathlib
from scipy import signal
import keras.utils as image
from pydub import AudioSegment
from tensorflow.keras.applications.mobilenet import preprocess_input
from tensorflow.keras.applications import MobileNetV2, VGG19
# import sounddevice as sd
import time

- Time taken to load model

In [None]:
# Load both networks once up front and report how long the setup takes.
# perf_counter is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments.
start = time.perf_counter()

# Classifier that process_audio feeds with the features produced by base_model.
model = tf.keras.models.load_model('ann_mobilenetv2_1_sec.h5')
# MobileNetV2 without its classification head, used as the feature extractor.
base_model = MobileNetV2(include_top=False, weights="imagenet", input_shape=(224, 224, 3))
# Scratch directory for the temporary spectrogram image; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('temp', exist_ok=True)

end = time.perf_counter()
print('load the model',f'>>>>{round((end-start),4)}')

- Load ground truth labels from CSV

In [None]:
# Read the ground-truth table and keep its 'Actual' column as a plain list,
# which is later indexed by chunk position.
fd = pd.read_csv(r'actual_labels.csv')
asd = fd['Actual'].tolist()
asd

- This function classifies one audio chunk and returns the predicted label together with the time taken by each of these stages:

    - convert the audio to a spectrogram and save it to disk
    - load the saved image from disk
    - extract features from the image using MobileNetV2
    - predict the class with the model
    - total time taken for execution

In [6]:
def process_audio(aud_data, sr, count):
    """Classify one audio chunk and time each stage of the pipeline.

    The chunk is drawn as a log-mel spectrogram, saved to ``temp/temp.png``,
    read back as a 224x224 image, passed through the global ``base_model``
    feature extractor and finally through the global ``model`` classifier.

    Parameters
    ----------
    aud_data : array-like
        Audio samples of the chunk (handed to librosa as ``y``).
    sr : int
        Sampling rate of ``aud_data``. Used both to compute and to draw the
        spectrogram, so the function no longer depends on a hard-coded 8000
        or on the notebook-level ``RATE`` variable.
    count : int
        Chunk number; only used in the progress banner.

    Returns
    -------
    tuple
        ``(label, first, second, Third, fourth, fifth)``: ``label`` is the
        entry of the global ``class_labels`` with the highest predicted score;
        the other items are elapsed seconds, rounded to 4 decimals, for
        audio -> spectrogram image, image -> array, feature extraction,
        classifier prediction, and the whole call respectively.
    """
    # Imported explicitly so `librosa.display` resolves even if a bare
    # `import librosa` does not load the submodule (older librosa releases
    # require this -- NOTE(review): confirm against the installed version).
    import librosa.display

    print(f'>>>>>>>>>>>>>>>>> for the {count} chunk >>>>>>>>>>>>>>>>>')

    image_path = 'temp/temp.png'
    total_start = time.perf_counter()

    # Stage 1: render the audio chunk as a spectrogram image on disk.
    start = time.perf_counter()
    fig = plt.figure()
    try:
        ax = fig.add_subplot(1, 1, 1)
        # Let the axes fill the whole canvas so the saved image has no margins.
        fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
        ms = librosa.feature.melspectrogram(y=aud_data, sr=sr)
        log_ms = librosa.power_to_db(ms, ref=np.max)
        librosa.display.specshow(log_ms, sr=sr, ax=ax)
        fig.savefig(image_path)
        first = round(time.perf_counter() - start, 4)
    finally:
        # Always release this figure (and only this one), even on failure;
        # otherwise figures pile up when the function is called in a loop.
        plt.close(fig)
    print('To convert audio sample into the spectogram',f'>>>>{first}')

    # Stage 2: load the image back as a (1, 224, 224, 3) batch.
    start = time.perf_counter()
    pixels = image.img_to_array(image.load_img(image_path, target_size=(224, 224)))
    os.remove(image_path)
    batch = np.expand_dims(pixels, axis=0)
    second = round(time.perf_counter() - start, 4)
    print('To convert spectogram into the array',f'>>>>{second}')

    # Stage 3: extract features from the image with the MobileNetV2 base model.
    start = time.perf_counter()
    features = base_model.predict(preprocess_input(batch), verbose=False)
    Third = round(time.perf_counter() - start, 4)
    print('extract the feature from array',f'>>>>{Third}')

    # Stage 4: classify the extracted features.
    start = time.perf_counter()
    predictions = model.predict(features, verbose=False)
    res = np.argmax(predictions)
    label = class_labels[res]
    fourth = round(time.perf_counter() - start, 4)
    print(label)
    print('model prediction timing',f'>>>>{fourth}')

    fifth = round(time.perf_counter() - total_start, 4)
    print('time taken by the whole code',f'>>>>{fifth}')

    return label, first, second, Third, fourth, fifth

- load the audio file
- take the next 2-second chunk and measure the time taken by each operation
- append the results to a list
- repeat steps 2 and 3 until the end of the audio

In [None]:
# Labels in the order of the classifier's output indices (used by
# process_audio to turn the argmax index into a name).
class_labels = ['IVR', 'Music', 'Speech']
audio_file = r'test_1sec.wav'
RATE = 8000       # sampling rate passed to librosa.load
time_chunk = 2    # chunk length in seconds

# One row per chunk: [actual label, predicted label, five timings].
lst = []

audio_data, sr = librosa.load(audio_file, sr=RATE)
chunk_len = RATE * time_chunk
# Start offset of every chunk; an empty recording yields no chunks at all
# instead of sending a zero-length array to process_audio.
chunk_starts = range(0, len(audio_data), chunk_len)

# Fail before any slow processing if there are fewer ground-truth labels than
# chunks, rather than hitting an IndexError part-way through the run.
if len(asd) < len(chunk_starts):
    raise IndexError(
        f'{audio_file!r} yields {len(chunk_starts)} chunks '
        f'but only {len(asd)} ground-truth labels are available'
    )

for c, offset in enumerate(chunk_starts):
    # Slicing past the end is safe: the last chunk just holds the remainder.
    chunk = audio_data[offset:offset + chunk_len]
    res, first, second, Third, fourth, fifth = process_audio(chunk, sr, c + 1)
    lst.append([asd[c], res, first, second, Third, fourth, fifth])

- convert the list to dataframe with column names

In [14]:
# Column order matches the rows appended to `lst`:
# actual label, predicted label, then the five timings from process_audio.
# The header strings are kept exactly as-is because they become the CSV header.
result_columns = [
    'Actual',
    'predict',
    'audio to spectogram',
    'spectogram to array',
    'extract the feature',
    'model prediction timing',
    'whole time-taken',
]
data = pd.DataFrame(lst, columns=result_columns)

In [None]:
# Display the per-chunk results table (actual vs. predicted label and timings).
data

- export the dataframe results to CSV

In [16]:
# Write the results table to disk without the DataFrame index column.
data.to_csv('mix_1sec.csv', index=False)