In [2]:
pip install sounddevice

Collecting sounddevice
  Downloading sounddevice-0.4.4-py3-none-win_amd64.whl (195 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.4.4
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install emoji

Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py): started
  Building wheel for emoji (setup.py): finished with status 'done'
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171035 sha256=2de9656fceba645a4eb9b8598ce4ca48a8dd2353ec1a1507889d6c62c8fb25fa
  Stored in directory: c:\users\saide\appdata\local\pip\cache\wheels\5e\8c\80\c3646df8201ba6f5070297fe3779a4b70265d0bfd961c15302
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.7.0
Note: you may need to restart the kernel to use updated packages.


<h1>Importing the Libraries</h1>

First, let's import all the necessary Python libraries.

In [3]:
import librosa as rosa
import numpy as np
import tensorflow as tf
import sounddevice as sd
import emoji

<h1>Importing the Trained Model</h1>

In [4]:
# Restore the trained Keras RNN from its HDF5 file.
rnn_h5 = tf.keras.models.load_model('RNN_RAVDESS.h5')

# Feature means and standard deviations saved from training; live features are
# standardised with these before being fed to the RNN.
mean_X, std_X = (np.load(stats_file) for stats_file in ('mean_X.npy', 'std_X.npy'))

<h1>Setting the Audio Parameters</h1>

In [5]:
# Audio capture settings used by the recording cell.
fs = 16_000                # Sampling rate in Hz
median_num_frames = 230    # Frames per clip fed to the model (from training data)

# Length of one recording in seconds: 230 frames * 512 samples / 16000 Hz = 7.36 s.
# NOTE(review): this formula assumes a hop of 512 samples, but the feature
# extraction below uses hop_length=256 -- confirm which value training used.
seconds = median_num_frames * 512 / fs

<h1>Defining the String Labels</h1>

In [6]:
def change_label(argument):
    """Translate an integer class label into its emotion name.

    Labels 1..7 map to the names listed below, in order; any other value
    yields the placeholder "Nothing".

    NOTE(review): the recording cell passes the result of np.argmax, which is
    0-based -- confirm the model's output indices really line up with 1..7.
    """
    emotion_names = (
        "Neutral",
        "Happy",
        "Sad",
        "Angry",
        "Fearful",
        "Disgust",
        "Surprised",
    )
    # Number the names starting at 1 to build the label -> name table.
    names_by_label = dict(enumerate(emotion_names, start=1))
    return names_by_label.get(argument, "Nothing")

def label_emoji(argument):
    """Translate an integer class label into an emoji character.

    Labels 1..7 map to the shortcodes listed below, in order, each rendered
    through emoji.emojize; any other value yields the placeholder "Nothing".
    """
    shortcodes = (
        ":neutral_face:",
        ":grinning_face_with_smiling_eyes:",
        ":disappointed_face:",
        ":angry_face:",
        ":fearful_face:",
        ":face_vomiting:",
        ":hushed_face:",
    )
    # Number the shortcodes starting at 1 and render each one to its emoji.
    emoji_by_label = {
        label: emoji.emojize(code)
        for label, code in enumerate(shortcodes, start=1)
    }
    return emoji_by_label.get(argument, "Nothing")

<h1>Recording Audio and Making Predictions</h1>

This cell records audio from the microphone in a loop, extracts features from each clip, and prints the predicted emotion. Interrupt the kernel to stop recording.

In [8]:
# Framing parameters shared by every librosa feature call below.
N_FFT = 512
HOP_LENGTH = 256
# 26 MFCCs + 7 spectral-contrast rows + 2 polynomial coefficients + 1 RMS value.
# The 7 and 2 come from librosa's defaults (n_bands=6 -> 7 rows, order=1 -> 2 rows).
NUM_FEATURES = 36


def _record_clip():
    """Record one mono clip of `seconds` seconds at `fs` Hz and return it as a 1-D array."""
    # In sounddevice, "frames" means samples; blocksize is the number of samples per audio block.
    clip = sd.rec(frames=int(fs * seconds), samplerate=fs, channels=1, blocksize=512)
    sd.wait()  # rec() returns immediately; block here until the recording is finished.
    # rec() returns shape (samples, channels). Drop the channel axis without
    # hard-coding the sample count, so changing fs/seconds cannot break this line.
    return clip.reshape(-1)


def _extract_features(sig):
    """Return a (num_frames, NUM_FEATURES) array: one row per audio frame, one column per feature."""
    framing = dict(n_fft=N_FFT, hop_length=HOP_LENGTH)
    mfcc_feat = rosa.feature.mfcc(y=sig, sr=fs, n_mfcc=26, htk=True, **framing)
    spec_feat = rosa.feature.spectral_contrast(y=sig, sr=fs, **framing)
    poly_feat = rosa.feature.poly_features(y=sig, sr=fs, **framing)
    rms_feat = rosa.feature.rms(y=sig, frame_length=N_FFT, hop_length=HOP_LENGTH)

    # Each extractor returns a 2-D (features, frames) array; stack all four
    # along the feature axis, then transpose so that each row is one frame.
    stacked = np.concatenate([mfcc_feat, spec_feat, poly_feat, rms_feat], axis=0)
    return stacked.T


def _fit_frame_count(frames):
    """Zero-pad or truncate `frames` along axis 0 to exactly `median_num_frames` rows."""
    num_frames = frames.shape[0]
    if num_frames < median_num_frames:
        # Too short: append all-zero frames at the end.
        return np.pad(frames, ((0, median_num_frames - num_frames), (0, 0)), constant_values=0)
    # Long enough: keep the first `median_num_frames` frames (a no-op on an exact match).
    # NOTE(review): with HOP_LENGTH=256 a 7.36 s clip yields about 461 frames under
    # librosa's default centred framing, so this truncation runs on every clip and
    # only roughly the first half of each recording reaches the model. The clip
    # length was derived with a hop of 512 -- confirm which hop training used.
    return frames[:median_num_frames]


def _predict_label(frames):
    """Standardise a (median_num_frames, NUM_FEATURES) array and return the model's argmax class index."""
    # Flatten feature-major (all frames of feature 0, then feature 1, ...) into one row.
    flat = frames.T.reshape(1, -1)

    # Standardise with the means and standard deviations saved from training.
    centered = (flat - mean_X) / std_X

    # NOTE(review): the flatten above is feature-major, yet this reshape reads the
    # row as (frames, features) before transposing. The order is kept exactly as
    # in the original pipeline; confirm it matches how the training inputs (and
    # mean_X / std_X) were laid out.
    batch = np.reshape(centered, (centered.shape[0], median_num_frames, NUM_FEATURES))
    batch = tf.transpose(batch, perm=[0, 2, 1])

    # verbose=0 keeps Keras from printing a progress bar on every iteration.
    probs = rnn_h5.predict(batch, verbose=0)

    # The batch holds exactly one clip. Index the single result explicitly:
    # calling int() on a 1-element array is deprecated in recent NumPy.
    # NOTE(review): argmax is 0-based while change_label/label_emoji are keyed
    # 1..7 -- confirm the model's output columns line up with those labels.
    return int(np.argmax(probs, axis=1)[0])


print('Recording...')

try:
    while True:
        sig = _record_clip()
        frames = _fit_frame_count(_extract_features(sig))
        pred = _predict_label(frames)

        # Look up the string label and emoji for the predicted class.
        emotion = change_label(pred)
        smiley = label_emoji(pred)

        # Print the output.
        print(smiley, " : ", emotion)

except KeyboardInterrupt:
    print('Recording has ended!')
finally:
    # Make sure no recording started by sd.rec() is left running in the
    # background, e.g. if the interrupt landed between rec() and wait().
    sd.stop()

Recording...
🤮  :  Disgust
🤮  :  Disgust
🤮  :  Disgust
🤮  :  Disgust
Recording has ended!
