In [None]:
# Written by Gabriel Sarch
# gabrielsarch@gmail.com
#
# Last edited: 4/20/2020

In [1]:
# Real-time audio detector 

In [6]:
# Function for processing data chunks
# 1) Normalize between -1 and 1
# 2) Bandpass 50-5000 Hz 
# 3) Extract MFCCs (40 of them)

from scipy import signal
import librosa
import numpy as np

RATE = 22050
sos = signal.butter(5, [50, 5000], 'bandpass', fs=RATE, output='sos')

def get_mfccs(audio): 
   
    try:
        audio = 2*((audio-min(audio))/(max(audio)-min(audio)))-1

        #Filter
        audio = signal.sosfilt(sos, audio) 

        mfccs = librosa.feature.mfcc(y=audio, sr=RATE, n_mfcc=40)

    except Exception as e:
            print("Error extracting features")
            return None
     
    return mfccs

In [7]:
# Initialize pyaudio settings

import pyaudio
#import struct
#import time
#import librosa.display

RATE = 22050
CHUNK = RATE*3
FORMAT = pyaudio.paInt16
CHANNELS = 1

# Identify which input is the mic - use to change input_device_index below
p = pyaudio.PyAudio()

chosen_device_index = -1
for x in range(0,p.get_device_count()):
    info = p.get_device_info_by_index(x)
    print(p.get_device_info_by_index(x))
  #  if info["name"] == "Microphone (Realtek High Defini":
  #      chosen_device_index = info["index"]
  #      print("Chosen index: ", chosen_device_index)

{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'Microphone (Realtek High Defini', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Output', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 3, 'structVersion': 2, 'name': 'Speakers (Realtek High Definiti', 'hostApi': 0, 'maxInputChann

In [8]:
# load model

from keras.models import load_model

modelSave = 'C:/Users/Gabe/Documents/SeniorDesign/CNNModels/siren_detector.h5' # location of the trained model
model = load_model(modelSave)

In [9]:
# Real time detection
# 1) reads in 3-second audio chunk (defined by CHUNK)
# 2) gets MFCCs
# 3) runs MFCCs through keras model to predict probabilities 
# 4) Determines whether siren was present based on threshold
p = pyaudio.PyAudio()
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input_device_index=1, # CHANGE
    input=True,
    output=True,
    frames_per_buffer=CHUNK
 )


import sounddevice as sd

num_rows = 40
num_columns = 130
num_channels = 1

#alldata = [] # Used to play back audio for testing 

prob_thresh = 0.98 # probability threshold for detecting the siren

while True:
#for i in range(0,5):
    data = stream.read(CHUNK)
    data_int = np.frombuffer(data, dtype=np.int16)
    
    #alldata = np.append(alldata, data_int) 
    
    data_int = get_mfccs(data_int)

    prediction_feature = data_int.reshape(1, num_rows, num_columns, num_channels)

    predicted_vector = model.predict_classes(prediction_feature)
    predicted_proba_vector = model.predict_proba(prediction_feature)

    #print(predicted_vector)
    
    print("not_siren: ", predicted_proba_vector[0][0], "siren: ", predicted_proba_vector[0][1])
    
    if predicted_proba_vector[0][1] > prob_thresh:
        print('SIREN!!!')
    else:
        print('No siren. Carry on.')


siren:  0.99995244 not_siren:  4.7510395e-05
No siren. Carry on.
siren:  0.9999832 not_siren:  1.6856056e-05
No siren. Carry on.
siren:  0.9999825 not_siren:  1.7528198e-05
No siren. Carry on.
siren:  0.99998975 not_siren:  1.0228796e-05
No siren. Carry on.
siren:  0.99999535 not_siren:  4.5939933e-06
No siren. Carry on.
siren:  0.99997973 not_siren:  2.0312254e-05
No siren. Carry on.
siren:  0.9999459 not_siren:  5.4138043e-05
No siren. Carry on.
siren:  0.13987076 not_siren:  0.86012924
No siren. Carry on.
siren:  0.9999646 not_siren:  3.541767e-05
No siren. Carry on.
siren:  0.99998474 not_siren:  1.523163e-05
No siren. Carry on.
siren:  0.9385345 not_siren:  0.061465506
No siren. Carry on.
siren:  0.9931117 not_siren:  0.006888238
No siren. Carry on.
siren:  0.9909644 not_siren:  0.009035534
No siren. Carry on.


KeyboardInterrupt: 

In [12]:
# play back recorded audio for testing purposes

import sounddevice as sd
sd.play(alldata, RATE)

