In [1]:
import tensorflow as tf
from keras.models import Sequential,Model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D

In [2]:
import numpy as np
import sounddevice as sd
import time
import signal
import sys
import librosa
import pyautogui

# Parameters
SAMPLE_RATE = 22050
CHUNK_DURATION = 0.8
CHUNK_SIZE = int(SAMPLE_RATE * CHUNK_DURATION)
ENERGY_THRESHOLD = 0.01

# Global flag to control recording
recording = True


def Model():
    model = Sequential()
    model.add(Conv2D(32,kernel_size=3,activation='relu',input_shape=(15,35,1)))
    model.add(MaxPool2D())
    model.add(Conv2D(64,kernel_size=3,activation='relu'))
    model.add(MaxPool2D())
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dropout(0.4))
    model.add(Dense(5,activation='softmax'))
    
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    model.load_weights('audio_processing2.weights.h5')
    return model

model = Model()


def processing(audio):
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=22050, n_fft=1048, hop_length=512, n_mels=15)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
    inputt = np.array(log_mel_spectrogram).reshape(1,15,35,1)
    print(inputt.shape)
    pred = model.predict(inputt)
    outputt = np.argmax(pred,axis=1)
    if outputt == 0:
        pyautogui.press('up')
        print('Jump')
    elif outputt == 1:
        pyautogui.press('down')
        print('Down')
    elif outputt == 2:
        pyautogui.press('left')
        print('Left')
    elif outputt == 3:
        pyautogui.press('right')
        print('Right')
    elif outputt == 4:
        print('Start')
        pyautogui.press('enter')
    

def is_speech(audio_data):
    # Compute the root mean square (RMS) energy of the audio data
    rms = np.sqrt(np.mean(np.square(audio_data)))
    print(f"RMS Energy: {rms}")

    # Determine if speech is present, based on RMS energy
    return rms > ENERGY_THRESHOLD

    
def callback(indata, frames, time, status):
    if status:
        print(status, file=sys.stderr)
    # Process audio data here
    audio_data = indata[:, 0]
    if is_speech(audio_data):
        processing(audio_data)
    else:
        print('Speech not Detected')

def signal_handler(sig, frame):
    global recording
    print("\nStopping recording...")
    recording = False

signal.signal(signal.SIGINT, signal_handler)


global recording

with sd.InputStream(samplerate=SAMPLE_RATE, channels=2, callback=callback, blocksize=CHUNK_SIZE) as stream:
    print("Recording started. Press Ctrl+C to stop...")
    while recording:
        time.sleep(CHUNK_DURATION)  # Sleep for the duration of each chunk to keep recording
    print("Recording stopped.")



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


Recording started. Press Ctrl+C to stop...
RMS Energy: 0.026231674477458
(1, 15, 35, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Left
RMS Energy: 0.0038298615254461765
Speech not Detected
RMS Energy: 0.003756255144253373
Speech not Detected
RMS Energy: 0.004072780255228281
Speech not Detected
RMS Energy: 0.14202678203582764
(1, 15, 35, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
Right
RMS Energy: 0.002732691587880254
Speech not Detected
RMS Energy: 0.0028137327171862125
Speech not Detected
RMS Energy: 0.0027411894407123327
Speech not Detected
RMS Energy: 0.0026794273871928453
Speech not Detected
RMS Energy: 0.10489621758460999
(1, 15, 35, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Right
RMS Energy: 0.0033203447237610817
Speech not Detected
RMS Energy: 0.003186509944498539
Speech not Detected
RMS Energy: 0.00341261038556695
Speech not Detected
RMS Energy: 0.0038403894286602736
Speech not Detecte