In [12]:
import sounddevice as sd
import numpy as np
import tensorflow as tf
import librosa


def compute_mfcc(y, sr):
    # Check if audio length is greater than or equal to 10 seconds
    if len(y) >= sr * 10:
        # Clip the audio to 10 seconds
        y = y[:sr * 10]
    else:
        # If audio length is less than 10 seconds, zero-pad at the end
        zero_padding = np.zeros(sr * 10 - len(y), dtype=np.float32)
        y = np.concatenate([y, zero_padding])
        
    # Extract MFCC features for the entire audio
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    
    return mfccs.T  # Transpose the MFCC matrix to have time along the rows and features along the columns


# Define parameters for audio recording
RATE = 16000
DURATION = 10  # Record for 10 seconds

# Record audio for 10 seconds
print("Recording...")
audio_data = sd.rec(int(DURATION * RATE), samplerate=RATE, channels=1, dtype=np.float32)
sd.wait()
print("Finished recording.")

# Preprocess the recorded audio
y = audio_data[:, 0]  # Extract mono audio data
sr = RATE
mfcc = compute_mfcc(y, sr)

# Reshape the MFCC array to match the input shape expected by the TFLite model
mfcc_input = mfcc.reshape(1, mfcc.shape[0], mfcc.shape[1])

# Load the TFLite model
interpreter = tf.lite.Interpreter(model_path="custom_LSTM_model.tflite")
interpreter.allocate_tensors()

# Get input and output tensors
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Set input tensor
interpreter.set_tensor(input_details[0]['index'], mfcc_input)

# Run inference
interpreter.invoke()

# Get the output tensor
output_data = interpreter.get_tensor(output_details[0]['index'])

# Get the predicted class ID
predicted_class_id = np.argmax(output_data)

# Define class mapping
class_mapping = {
    0: 'Clicking',
    1: 'Computer_keyboard',
    2: 'Cough',
    3: 'Female_speech,_woman_speaking',
    4: 'Hair_dryer',
    5: 'Laughter',
    6: 'Male_speech,_man_speaking',
    7: 'Silence',
    8: 'Sneeze',
    9: 'Vacuum_cleaner',
    10: 'book_page_flip'
}

# Map the predicted class ID to its label
predicted_class_label = class_mapping[predicted_class_id]

print("Predicted class:", predicted_class_label)


Recording...
Finished recording.
Predicted class: Computer_keyboard


In [1]:
import sounddevice as sd
import numpy as np
import tensorflow as tf
import librosa
from collections import deque
import time

def compute_mfcc(y, sr):
    # Extract MFCC features for the entire audio
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    return mfccs.T  # Transpose the MFCC matrix

# Define parameters for audio recording
RATE = 16000
SEGMENT_DURATION = 1  # Duration of each audio segment in seconds
TOTAL_DURATION = 10  # Total duration of audio to maintain in the buffer, in seconds
SEGMENT_SAMPLES = RATE * SEGMENT_DURATION  # Number of samples in each segment
BUFFER_SAMPLES = RATE * TOTAL_DURATION  # Total number of samples in the buffer

# Initialize a deque as a ring buffer to store audio data
audio_buffer = deque(maxlen=BUFFER_SAMPLES)

# Initialize the TFLite model
interpreter = tf.lite.Interpreter(model_path="custom_LSTM_model.tflite")
interpreter.allocate_tensors()

# Get input and output tensors
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

def inference_from_buffer(buffer, rate):
    # Convert the buffer to a numpy array
    audio_data = np.array(buffer)
    # Compute MFCC
    mfcc = compute_mfcc(audio_data, rate)
    # Reshape the MFCC for the model
    mfcc_input = mfcc.reshape(1, mfcc.shape[0], mfcc.shape[1])
    # Set input tensor
    interpreter.set_tensor(input_details[0]['index'], mfcc_input)
    # Run inference
    interpreter.invoke()
    # Get the output tensor
    output_data = interpreter.get_tensor(output_details[0]['index'])
    # Get the predicted class ID
    return np.argmax(output_data)

class_mapping = {
    0: 'Clicking',
    1: 'Computer_keyboard',
    2: 'Cough',
    3: 'Female_speech,_woman_speaking',
    4: 'Hair_dryer',
    5: 'Laughter',
    6: 'Male_speech,_man_speaking',
    7: 'Silence',
    8: 'Sneeze',
    9: 'Vacuum_cleaner',
    10: 'book_page_flip'
}
    

# Callback function to process each block of audio
def audio_callback(indata, frames, time, status):
    # Flatten and append new audio data to the buffer
    audio_buffer.extend(indata[:, 0])

# Start recording and processing
print("Recording and processing in real-time. Press Ctrl+C to stop...")
with sd.InputStream(samplerate=RATE, channels=1, dtype=np.float32, callback=audio_callback):
    # Zero-pad the initial buffer if necessary
    while len(audio_buffer) < BUFFER_SAMPLES:
        time.sleep(SEGMENT_DURATION)
        print("Gathering initial audio...")

    # Perform the first inference
    predicted_class_id = inference_from_buffer(audio_buffer, RATE)
    print("Initial Predicted class:", class_mapping[predicted_class_id])

    # Now, keep processing in real-time
    while True:
        # Wait for 1 second worth of new data
        time.sleep(SEGMENT_DURATION)
        # Perform inference
        predicted_class_id = inference_from_buffer(audio_buffer, RATE)
        print("Predicted class:", class_mapping[predicted_class_id])

Recording and processing in real-time. Press Ctrl+C to stop...
Gathering initial audio...
Gathering initial audio...
Gathering initial audio...
Gathering initial audio...
Gathering initial audio...
Gathering initial audio...
Gathering initial audio...
Gathering initial audio...
Gathering initial audio...
Gathering initial audio...
Initial Predicted class: Computer_keyboard
Predicted class: Vacuum_cleaner
Predicted class: Vacuum_cleaner
Predicted class: Computer_keyboard
Predicted class: Computer_keyboard
Predicted class: Computer_keyboard
Predicted class: Computer_keyboard
Predicted class: Vacuum_cleaner
Predicted class: Vacuum_cleaner
Predicted class: Vacuum_cleaner
Predicted class: Vacuum_cleaner
Predicted class: Vacuum_cleaner
Predicted class: Vacuum_cleaner
Predicted class: Male_speech,_man_speaking
Predicted class: Cough
Predicted class: Male_speech,_man_speaking
Predicted class: Male_speech,_man_speaking
Predicted class: Cough
Predicted class: Male_speech,_man_speaking
Predicted 

KeyboardInterrupt: 