In [2]:
import numpy as np
import librosa
from tensorflow.keras.models import load_model
import sounddevice as sd
from scipy.io.wavfile import write
import tensorflow as tf
from IPython.display import Audio, display


In [3]:
# Load the model
# The path is relative, so the .h5 file must sit in the notebook's working directory.
model = load_model('audio_emotion_detection.h5')

# Parameters
# NOTE: predict_emotion reads `sample_rate` as a global when extracting features,
# so any audio passed to it should be sampled at this rate.
sample_rate = 22050  # Sample rate for audio recording
# NOTE(review): `duration` is not referenced by the cells visible in this notebook
# (the real-time loop uses its own `segment_duration`) -- confirm before removing.
duration = 3  # Duration of audio recording in seconds



In [4]:
def record_audio(duration, sample_rate):
    """Record a single-channel clip with sounddevice and return it as an array.

    An inline audio player for the captured clip is displayed as a side
    effect so the recording can be checked by ear.

    Args:
        duration: Length of the recording in seconds.
        sample_rate: Sampling rate in Hz, used for both capture and playback.

    Returns:
        The recorded float32 samples with singleton axes (the channel axis)
        squeezed out.
    """
    print("Recording...")
    n_frames = int(duration * sample_rate)
    recording = sd.rec(n_frames, samplerate=sample_rate, channels=1, dtype='float32')
    # sd.rec returns immediately; block here until the buffer has been filled.
    sd.wait()
    samples = np.squeeze(recording)
    display(Audio(samples, rate=sample_rate))
    return samples


In [5]:
# Zero Crossing Rate
def zcr(data, frame_length=2048, hop_length=512):
    """Compute the zero-crossing rate of ``data`` frame by frame.

    Args:
        data: Audio time series passed to librosa as ``y``.
        frame_length: Number of samples per analysis frame.
        hop_length: Number of samples between successive frames.

    Returns:
        The librosa zero-crossing-rate result with singleton axes removed.
    """
    crossing_rates = librosa.feature.zero_crossing_rate(
        y=data,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    return np.squeeze(crossing_rates)

# Root Mean Square Energy
def rmse(data, frame_length=2048, hop_length=512):
    """Compute the root-mean-square energy of ``data`` frame by frame.

    Args:
        data: Audio time series passed to librosa as ``y``.
        frame_length: Number of samples per analysis frame.
        hop_length: Number of samples between successive frames.

    Returns:
        The librosa RMS result with singleton axes removed.
    """
    energy = librosa.feature.rms(
        y=data,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    return np.squeeze(energy)

# Mel-Frequency Cepstral Coefficients
def mfcc(data, sr, n_fft=2048, hop_length=512, flatten=True):
    """Compute MFCCs for ``data`` and return them transposed.

    Args:
        data: Audio time series passed to librosa as ``y``.
        sr: Sampling rate of ``data`` in Hz.
        n_fft: FFT window length in samples.
        hop_length: Number of samples between successive frames.
        flatten: When truthy, return the transposed matrix flattened to 1-D;
            otherwise return the transposed matrix with singleton axes removed.

    Returns:
        The transposed MFCC matrix, either raveled or squeezed.
    """
    coefficients = librosa.feature.mfcc(y=data, sr=sr, n_fft=n_fft, hop_length=hop_length)
    transposed = coefficients.T
    if flatten:
        return np.ravel(transposed)
    return np.squeeze(transposed)

In [6]:

def extract_features(audio, sr, frame_length=2048, hop_length=512):
    """Build the 1-D feature vector fed to the emotion model.

    The vector is laid out as ``[mean ZCR, mean RMS, flattened MFCCs...]``.

    Args:
        audio: 1-D audio time series.
        sr: Sampling rate of ``audio`` in Hz.
        frame_length: Analysis window length in samples, used for ZCR, RMS
            and (as ``n_fft``) for the MFCCs.
        hop_length: Number of samples between successive frames, used for all
            three features.

    Returns:
        A 1-D numpy array: two scalars followed by the flattened MFCC matrix.
    """
    # NOTE(review): preprocess_features pads to 2376 values, and 2376 = 22 * 108.
    # That would match per-frame ZCR (108) + per-frame RMS (108) + 20 MFCCs x 108
    # frames, whereas the means below give 2 + (MFCC count) values. Confirm against
    # the training pipeline that averaging ZCR/RMS here is intended.
    zcr_val = zcr(audio, frame_length=frame_length, hop_length=hop_length).mean()
    rmse_val = rmse(audio, frame_length=frame_length, hop_length=hop_length).mean()
    # Forward the framing parameters so the MFCC frames use the same window and
    # hop as ZCR/RMS; previously they were silently fixed at the mfcc() defaults.
    mfcc_val = mfcc(audio, sr=sr, n_fft=frame_length, hop_length=hop_length)
    return np.hstack([zcr_val, rmse_val, mfcc_val])

def preprocess_features(features, target_length=2376):
    """Zero-pad or truncate a feature vector and add batch/channel axes.

    Args:
        features: Feature values; any array-like is accepted and flattened to
            1-D before use.
        target_length: Number of values the model expects per sample. The
            default of 2376 assumes a model input shape of (None, 2376, 1).

    Returns:
        A float array of shape ``(1, target_length, 1)`` holding the first
        ``target_length`` feature values, followed by zeros if the input was
        shorter.
    """
    flat = np.asarray(features, dtype=float).ravel()
    n_copy = min(target_length, flat.shape[0])
    padded = np.zeros(target_length)
    padded[:n_copy] = flat[:n_copy]
    # Add a leading batch axis and a trailing channel axis: (1, target_length, 1).
    return padded[np.newaxis, :, np.newaxis]

# Class index -> human-readable label, built once instead of on every call.
# NOTE(review): the index order must match the label encoding used at training
# time -- confirm against the training notebook.
_EMOTION_LABELS = {0: 'Angry', 1: 'Disgust', 2: 'Fear', 3: 'Happy', 4: 'Neutral', 5: 'Sad', 6: 'Surprise'}


def predict_emotion(audio, sr=None):
    """Predict the emotion label for one audio clip.

    Args:
        audio: 1-D audio time series.
        sr: Sampling rate of ``audio`` in Hz. When omitted, the notebook-level
            ``sample_rate`` is used, which matches the previous behaviour.

    Returns:
        The predicted label as a string, or ``"Unknown"`` if the predicted
        class index has no entry in the label map.
    """
    if sr is None:
        sr = sample_rate
    # Extract features from audio, then pad/reshape them to the model input shape
    features = extract_features(audio, sr)
    processed_features = preprocess_features(features)
    # Predict emotion: take the highest-scoring class for the single sample
    prediction = model.predict(processed_features)
    class_index = int(np.argmax(prediction, axis=1)[0])
    return _EMOTION_LABELS.get(class_index, "Unknown")

In [7]:
# Print the loaded model's layer-by-layer summary, e.g. to check the input shape it expects.
model.summary()

# Recorded Clips

In [8]:
# CREMA-D sample clip.
file_path = "./CREMA-D/AudioWAV/1091_TIE_ANG_XX.wav"
# Load at the notebook-level rate: predict_emotion extracts features with
# `sample_rate`, so the clip must be resampled to that same rate.
audio, sr = librosa.load(file_path, sr=sample_rate)
predict_emotion(audio)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 766ms/step


'Angry'

In [9]:
# RAVDESS sample clip.
file_path = "./RAVDESS/Actor_10/03-01-07-02-01-02-10.wav"
# Load at the notebook-level rate: predict_emotion extracts features with
# `sample_rate`, so the clip must be resampled to that same rate.
audio, sr = librosa.load(file_path, sr=sample_rate)
predict_emotion(audio)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step


'Sad'

In [14]:
# Clip from the local "final testing" folder.
file_path = "./final testing/DC_a10.wav"
# Load at the notebook-level rate: predict_emotion extracts features with
# `sample_rate`, so the clip must be resampled to that same rate.
audio, sr = librosa.load(file_path, sr=sample_rate)
predict_emotion(audio)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step


'Disgust'

# Real Time -- Recording + Detection

In [10]:
from collections import Counter

def real_time_emotion_detection(sample_rate=22050, segment_duration=5, total_duration=20):
    """Record consecutive audio segments and report the most frequent emotion.

    Args:
        sample_rate: Recording sample rate in Hz.
        segment_duration: Length of each recorded segment in seconds; must be
            positive.
        total_duration: Total recording budget in seconds; must cover at least
            one full segment. Any remainder shorter than a segment is dropped.

    Returns:
        The label predicted most often across the recorded segments.

    Raises:
        ValueError: If ``segment_duration`` is not positive or
            ``total_duration`` is shorter than one segment.
    """
    if segment_duration <= 0:
        raise ValueError("segment_duration must be positive")
    # int() keeps range() working when the durations are given as floats.
    num_segments = int(total_duration // segment_duration)  # Number of segments to process
    if num_segments < 1:
        # Without this check the loop would record nothing and the
        # most_common lookup below would fail with an opaque IndexError.
        raise ValueError("total_duration must be at least one segment_duration")

    emotions = []
    print("Starting real-time emotion detection...")

    # NOTE: predict_emotion extracts features using the notebook-level
    # `sample_rate`, so this function's `sample_rate` argument should match it.
    for segment in range(num_segments):
        print(f"Recording segment {segment + 1}/{num_segments}...")
        audio = record_audio(segment_duration, sample_rate)

        prediction = predict_emotion(audio)
        emotions.append(prediction)

        print(f"Predicted Emotion for segment {segment + 1}: {prediction}")

    # Determine the most frequent emotion
    dominant_emotion = Counter(emotions).most_common(1)[0][0]

    print(f"The dominant emotion is: {dominant_emotion}")
    return dominant_emotion


In [21]:
# Run the live demo with the function defaults (20 s total in 5 s segments, i.e. four recordings).
# Requires a working audio input device.
real_time_emotion_detection()

Starting real-time emotion detection...
Recording segment 1/4...
Recording...


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
Predicted Emotion for segment 1: Surprise
Recording segment 2/4...
Recording...


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
Predicted Emotion for segment 2: Surprise
Recording segment 3/4...
Recording...


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
Predicted Emotion for segment 3: Surprise
Recording segment 4/4...
Recording...


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
Predicted Emotion for segment 4: Surprise
The dominant emotion is: Surprise


'Surprise'