In [1]:
import sounddevice as sd
import numpy as np
from scipy.io.wavfile import write
import time
import librosa
from sklearn.preprocessing import MinMaxScaler
from keras.models import load_model

In [2]:
# Labels the classifier can report. predict_class() indexes this list with the
# argmax of the model output, so the order of the active entries matters.
# Commented-out entries are disabled labels kept for reference
# (presumably not part of the saved model -- TODO confirm).
classes = [
    'screaming',
    # 'whispering',
    # 'music',
    # 'water',
    # 'wind',
    # 'vehicle',
    # 'tools',
    'Subway, metro, underground',
    # 'alarm',
    # 'silence',
    # 'animal',
    'speech',
    # 'television',
    'Outside, rural or natural',
    'Traffic noise, roadway noise',
    'Inside, small room',
    # 'radio',
    'Outside, urban or manmade',
    'singing',
    # 'Inside, large room or hall',
]

In [3]:
def record_sound(fs=44000, seconds=2, countdown=2):
    """Record a mono clip from the default input device and start playing it back.

    Prints a countdown, records through ``sounddevice``, waits for the
    recording to finish, then starts playback of the captured audio.

    Parameters
    ----------
    fs : int, optional
        Sample rate in Hz. The default (44000) is the rate that
        ``preprocess_sound`` hands to librosa by default.
    seconds : int or float, optional
        Length of the recording in seconds.
    countdown : int, optional
        Whole seconds to count down before recording starts. Zero or a
        negative value skips the countdown.

    Returns
    -------
    numpy.ndarray
        The recorded samples, one column per channel, i.e. shape
        ``(int(seconds * fs), 1)``.

    Raises
    ------
    ValueError
        If ``seconds * fs`` does not amount to at least one sample.
    """
    n_frames = int(seconds * fs)
    if n_frames <= 0:
        raise ValueError(
            'seconds * fs must be at least one sample, got seconds={!r}, fs={!r}'.format(seconds, fs)
        )

    # Count down so the user can get ready. range() is bounded, so a negative
    # countdown simply skips the loop instead of spinning forever.
    print('get ready to record: ')
    for remaining in range(int(countdown), 0, -1):
        mins, secs = divmod(remaining, 60)
        # "\r" rewrites the same output line on every tick.
        print('{:02d}:{:02d}'.format(mins, secs), end="\r")
        time.sleep(1)

    # Record sound
    print('record now for {} seconds!'.format(seconds))
    myrecording = sd.rec(n_frames, samplerate=fs, channels=1)
    sd.wait()  # Block until the recording buffer is completely filled
    print('recording done.')

    # To keep the clip on disk, call scipy.io.wavfile.write(path, fs, myrecording);
    # a relative path is resolved against the kernel's working directory.
    print('Playing recording back: ')
    # No sd.wait() afterwards: the function returns right after starting playback.
    sd.play(myrecording, fs)
    return myrecording

In [13]:
def preprocess_sound(myrecording, sr=44000):
    """Convert a raw recording into the 4-D tensor that is fed to the model.

    Steps: flatten the samples, compute a mel spectrogram, convert it to
    decibels, keep the first 11776 values, min-max scale them to [0, 1] and
    reshape to ``(1, 128, 92, 1)``.

    Parameters
    ----------
    myrecording : array-like
        Recorded samples, e.g. the array returned by ``record_sound``.
    sr : int, optional
        Sample rate of ``myrecording`` in Hz, forwarded to librosa.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, 128, 92, 1)`` with values in [0, 1].

    Raises
    ------
    ValueError
        If the recording yields fewer than 11776 spectrogram values.
    """
    n_rows = 128                 # second axis of the returned tensor
    n_cols = 92                  # third axis of the returned tensor
    n_values = n_rows * n_cols   # 11776 values are kept

    # Flatten (frames, channels) into a 1-D signal
    samples = np.asarray(myrecording).flatten()

    # Audio must be passed as a keyword: librosa >= 0.10 no longer accepts it
    # positionally.
    mel_feat = librosa.feature.melspectrogram(y=samples, sr=sr)
    # Convert a power spectrogram (amplitude squared) to decibel (dB) units
    power_db = librosa.power_to_db(mel_feat)

    # NOTE(review): the spectrogram is flattened in row-major order before it
    # is sliced, so the kept values are the leading rows in full rather than
    # the first 92 columns of every row. Left as-is because the saved model was
    # presumably trained on this exact layout -- TODO confirm.
    flat_db = power_db.reshape(-1, 1)
    if flat_db.shape[0] < n_values:
        raise ValueError(
            'recording too short: got {} spectrogram values, need at least {}'.format(
                flat_db.shape[0], n_values
            )
        )
    melspectrogram_feat = flat_db[:n_values]

    # Scale input between 0 and 1, using the min/max of this clip only
    norm_mel = MinMaxScaler(feature_range=(0, 1)).fit_transform(melspectrogram_feat)

    # Reshape to (batch, rows, cols, channels)
    return np.reshape(norm_mel, (1, n_rows, -1, 1))

In [22]:
def predict_class(feat, model=None, model_path='my_model_opt.h5'):
    """Classify one preprocessed clip and return its label.

    Parameters
    ----------
    feat : numpy.ndarray
        Model input, e.g. the tensor returned by ``preprocess_sound``.
    model : object, optional
        An already loaded model exposing ``predict``. Pass it when calling
        this function repeatedly so the file is not re-read on every call.
    model_path : str, optional
        File handed to ``load_model`` when ``model`` is not supplied.

    Returns
    -------
    str
        The entry of the module-level ``classes`` list whose index equals the
        argmax of the first prediction row. The label is also printed.
    """
    if model is None:
        # Loading is the expensive step; callers can avoid it via `model`.
        model = load_model(model_path)

    # Predict class probabilities
    y_prob = np.atleast_2d(model.predict(feat))

    # Grab highest confidence class of the first (only) sample.
    # Assumes the model's outputs line up with `classes` -- TODO confirm.
    best_index = int(y_prob.argmax(axis=-1)[0])
    label = classes[best_index]

    print("Sound prediction: ", label)
    return label

In [39]:
# End-to-end run: record a clip, build the model input from it, classify it.
myrecording = record_sound()
feat = preprocess_sound(myrecording)
# predict_class() already prints the label; the print below repeats it,
# which is why the label shows up twice in the cell output.
pred = predict_class(feat)
print(pred)

get ready to record: 
record now for 2 seconds!
recording done.
Playing recording back: 
Sound prediction:  Inside, small room
Inside, small room


In [40]:
# Display the raw samples returned by record_sound() in the cell above.
myrecording

array([[-0.00996824],
       [-0.00922157],
       [-0.01826992],
       ...,
       [-0.01083764],
       [-0.00964471],
       [-0.00822937]], dtype=float32)