# IMPORTING LIBRARIES

In [1]:
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
import IPython.display as ipd
from scipy import signal
import pyaudio
import time
import wave

from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras import backend as K
import tensorflow as tf

# LOAD AND PROCESS DATA

In [2]:
emergency, sample_rate = librosa.load('emergency.wav', sr = 16000)
non_emergency, sample_rate= librosa.load('non emergency.wav', sr =16000)

In [3]:
# BREAKING AUDIO INTO 2 SEC CHUNKS
def prepare_data(samples, num_of_samples = 32000, num_of_common = 16000):
  
  data = []
  for offset in range(0, len(samples), num_of_common):
    start = offset
    end   = offset + num_of_samples
    chunk = samples[start:end]
    
    if(len(chunk) == 32000):
      data.append(chunk)
    
  return data

In [4]:
emergency = prepare_data(emergency)
non_emergency = prepare_data(non_emergency)

In [5]:
# COMBINE DATA
audio = np.concatenate([emergency,non_emergency])

In [6]:
# CREATING LABELS
labels1 = np.ones(len(emergency))
labels2 = np.zeros(len(non_emergency))
labels = np.concatenate([labels1,labels2])

In [7]:
x_tr, x_val, y_tr, y_val = train_test_split(np.array(audio), np.array(labels), stratify = labels, test_size = 0.1, random_state = 777, shuffle = True)

In [8]:
def log_specgram(audio, sample_rate, eps=1e-10):
    freqs, times, spec = signal.spectrogram(audio, fs = sample_rate, nperseg = 320, noverlap = 160, detrend = False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [9]:
# EXTRACT SPECTROGRAM FEATURES
def extract_spectrogram_features(x_tr):
  features = []
  for i in x_tr:
    _, _, spectrogram = log_specgram(i, sample_rate)
    
    mean = np.mean(spectrogram, axis = 0)
    std = np.std(spectrogram, axis = 0)
    spectrogram = (spectrogram - mean) / std
    
    features.append(spectrogram)

  return np.array(features)

In [10]:
x_tr_features  = extract_spectrogram_features(x_tr)
x_val_features = extract_spectrogram_features(x_val)

In [11]:
def lstm(x_tr):
    K.clear_session()
    inputs = Input(shape=(x_tr.shape[1], x_tr.shape[2]))
    x = LSTM(128)(inputs)
    x = Dropout(0.3)(x)
    x = Dense(64, activation = 'relu')(x)
    x = Dense(1, activation = 'sigmoid')(x)
    model = Model(inputs, x)
    return model

In [12]:
model = lstm(x_tr_features)

In [13]:
model.load_weights('SPEC_LSTM/best_spec_lstm_model_10.hdf5')

In [14]:
def predict(audio):
    ipd.Audio(audio, rate = 16000)
    feature = extract_spectrogram_features([audio])
    prob = model.predict(feature)
    if (prob[0][0] < 0.5 ):
        pred = 'non-emergency vehicle keep signal as is'
    else:
        pred = 'emergency vehicle make signal green' 
    print("Prediction:",pred)

In [15]:
ipd.Audio(x_val[0], rate = 16000)

In [16]:
predict(x_tr[0])

Prediction: non-emergency vehicle keep signal as is


In [17]:
ipd.Audio(x_val[35], rate = 16000)

In [18]:
predict(x_tr[35])

Prediction: emergency vehicle make signal green


# INFERENCE

In [19]:
# DECLARING CONSTANTS
chunk = 1024
channels = 1
sample_rate = 16000
record_duration = 2
audio_format = pyaudio.paInt16
threshold = 500

In [20]:
pa = pyaudio.PyAudio()

stream = pa.open(
    format = audio_format,
    channels = channels,
    rate = sample_rate,
    input = True,
    frames_per_buffer = chunk,
)

In [21]:
# CHECKING IF AUDIO IS MORE THAN THRESHOLD
def detect_voice(frame):
    avg_value = np.average(np.abs(frame))
    if avg_value > threshold:
        return True
    else:
        return False

In [22]:
# WRITE EACH RECORDING TO FILE
def write_to_file(recording):
    filename = 'temp.wav'
    wf = wave.open(filename, 'wb')
    wf.setnchannels(channels)
    wf.setsampwidth(pa.get_sample_size(audio_format))
    wf.setframerate(sample_rate)
    wf.writeframes(recording)
    wf.close()
    return filename

In [23]:
def record():
    print('Voice detected - begin to record')
    waveform = []
    current = time.time()
    end = time.time() + record_duration
    while current <= end:
        data = stream.read(chunk)
        waveform.append(data[:199])
        current = time.time()
    filename = write_to_file(b''.join(waveform))
    waveform, sample_rate = librosa.load(filename, 16000)
    predict(waveform)
    print('Return to listening\n\n')

In [25]:
print('Listening ...')
while True:
    frame = np.frombuffer(stream.read(chunk), dtype=np.int16)
    if detect_voice(frame):
        record()

Listening ...
Voice detected - begin to record
Prediction: non-emergency vehicle keep signal as is
Return to listening


Voice detected - begin to record
Prediction: emergency vehicle make signal green
Return to listening


Voice detected - begin to record
Prediction: non-emergency vehicle keep signal as is
Return to listening


Voice detected - begin to record
Prediction: emergency vehicle make signal green
Return to listening


Voice detected - begin to record
Prediction: emergency vehicle make signal green
Return to listening


Voice detected - begin to record
Prediction: non-emergency vehicle keep signal as is
Return to listening


Voice detected - begin to record
Prediction: non-emergency vehicle keep signal as is
Return to listening


Voice detected - begin to record
Prediction: non-emergency vehicle keep signal as is
Return to listening


Voice detected - begin to record
Prediction: emergency vehicle make signal green
Return to listening




KeyboardInterrupt: 