In [1]:
"""
This file can be used to try a live prediction. 
"""

import keras
import numpy as np
import librosa
import pyaudio
import wave
import keyboard

class livePredictions:
    """
    Live speech-emotion prediction helper.

    Loads a saved Keras model, records a clip from the microphone into a WAV
    file and prints the emotion the model predicts for that clip.
    """

    def __init__(self, path):
        """
        Store the location of the saved model.

        :param path: path to the saved Keras model file (e.g. an ``.h5`` file).
        """
        self.path = path

    def load_model(self):
        """
        Load the model stored at ``self.path`` into ``self.loaded_model``.

        :return: whatever the model's ``.summary()`` call returns.
        """
        self.loaded_model = keras.models.load_model(self.path)
        return self.loaded_model.summary()

    def record_audio(self, duration=5, filename='recorded_audio.wav'):
        """
        Record mono 16-bit audio from the microphone until 'ESC' is pressed.

        :param duration: currently unused; recording stops only when the
                         'ESC' key is pressed. Kept so existing callers that
                         pass it keep working.
        :param filename: name of the WAV file the recording is written to.
        :return: ``filename``.
        """
        # Audio parameters (local renamed so it no longer shadows ``format``).
        sample_format = pyaudio.paInt16
        channels = 1
        rate = 44100
        frames_per_buffer = 1024

        frames = []
        audio = pyaudio.PyAudio()
        try:
            # Query the sample width before the PyAudio session is terminated.
            sample_width = audio.get_sample_size(sample_format)

            stream = audio.open(format=sample_format, channels=channels,
                                rate=rate, input=True,
                                frames_per_buffer=frames_per_buffer)
            try:
                print("Recording...Press the 'ESC' key to stop.")
                while True:
                    frames.append(stream.read(frames_per_buffer))

                    # The key is polled once per buffer, so at least one
                    # buffer is always captured before recording can stop.
                    if keyboard.is_pressed('esc'):
                        print("Stopped recording.")
                        break
            finally:
                # Release the stream even if reading fails part-way through.
                stream.stop_stream()
                stream.close()
        finally:
            # Always release the audio session, including when open() fails.
            audio.terminate()

        # Save the recorded audio to a WAV file
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(b''.join(frames))

        return filename

    def makepredictions(self, audio_file):
        """
        Extract features from an audio file and print the predicted emotion.

        ``load_model()`` must have been called first, because this method
        reads ``self.loaded_model``.

        :param audio_file: path of the audio file passed to ``librosa.load``.
        """
        data, sampling_rate = librosa.load(audio_file)
        # Average the 40 MFCC coefficients over axis 0 of the transposed
        # matrix, giving one value per coefficient.
        mfccs = np.mean(librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=40).T, axis=0)
        # Add a trailing axis and then a leading (batch) axis before predict().
        x = np.expand_dims(mfccs, axis=1)
        x = np.expand_dims(x, axis=0)
        predictions = self.loaded_model.predict(x)
        predicted_class_index = np.argmax(predictions)
        print("Prediction is", " ", self.convertclasstoemotion(predicted_class_index))

    @staticmethod
    def convertclasstoemotion(pred):
        """
        Convert a predicted class index into a human readable emotion label.

        :param pred: class index (a Python or NumPy integer, e.g. the result
                     of ``np.argmax``).
        :return: the matching label, or ``'Unknown'`` when ``pred`` is not one
                 of the known class indices.
        """
        # NOTE(review): index 4 is labelled 'frustration'; confirm this matches
        # the label order the model was trained with.
        label_conversion = {0: 'neutral',
                            1: 'calm',
                            2: 'happy',
                            3: 'sad',
                            4: 'frustration',
                            5: 'fearful',
                            6: 'disgust',
                            7: 'surprised'}

        try:
            index = int(pred)
        except (TypeError, ValueError, OverflowError):
            return 'Unknown'

        # Reject values that only become a valid index through truncation or
        # parsing (e.g. 2.5 or "3"); the original matched on equality only.
        if index != pred:
            return 'Unknown'
        return label_conversion.get(index, 'Unknown')

# Replace 'SER_model.h5' with the path to your own trained model.
# The clip to classify is recorded live from the microphone and saved to
# 'recorded_audio.wav' before being passed to the model.

pred = livePredictions(path='SER_model.h5')

pred.load_model()
# NOTE: record_audio() keeps recording until the 'ESC' key is pressed; the
# duration argument is not used by the current implementation.
audio_file = pred.record_audio(duration=5, filename='recorded_audio.wav')
# Prints the predicted emotion for the recorded clip.
pred.makepredictions(audio_file)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_1 (Conv1D)           (None, 40, 64)            384       
                                                                 
 activation_1 (Activation)   (None, 40, 64)            0         
                                                                 
 dropout_1 (Dropout)         (None, 40, 64)            0         
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 10, 64)            0         
 g1D)                                                            
                                                                 
 conv1d_2 (Conv1D)           (None, 10, 128)           41088     
                                                                 
 activation_2 (Activation)   (None, 10, 128)           0         
                                                      