In [2]:
import cv2
import numpy as np
import os
import librosa
import pyaudio
import wave
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
import tkinter as tk
from PIL import Image, ImageTk

# ---------------------------------------------------------------------------
# Speech-emotion model settings
# ---------------------------------------------------------------------------
MFCC_LENGTH = 20        # number of MFCC coefficients requested from librosa
MAX_PAD_LEN = 100       # MFCC time axis is padded/cropped to this many frames
AUDIO_DURATION = 2      # length of each microphone recording, in seconds
RATE = 44_100           # microphone sample rate passed to PyAudio
CHUNK = 1024            # frames read from the audio stream per call

# ---------------------------------------------------------------------------
# Facial-expression model settings
# ---------------------------------------------------------------------------
IMAGE_SIZE = (48, 48)   # frames are resized to this size before prediction
# NOTE(review): the two values below are not referenced by the code visible
# here; presumably they describe the facial model's output/input — confirm.
NUM_CLASSES = 7
NUM_CHANNELS = 1

# Trained Keras models, loaded once at import time from the working directory.
speech_model = load_model('speech_emotion_model.h5')
facial_expression_model = load_model('facial_emotion_model.h5')

# Class-index -> label lookup tables, one per model.
# NOTE(review): the index order differs between the two tables; presumably
# each matches the label encoding its model was trained with — confirm.
emotion_labels = dict(enumerate(
    ('angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise')
))
expression_labels = dict(enumerate(
    ('Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral')
))

def _record_wav(path):
    """Record AUDIO_DURATION seconds of mono 16-bit audio and save it to *path*.

    The PyAudio stream and instance are always released, even when opening
    or reading the stream fails, so a failed recording does not leave the
    audio device held.
    """
    audio = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = audio.get_sample_size(pyaudio.paInt16)
        stream = audio.open(format=pyaudio.paInt16, channels=1, rate=RATE,
                            input=True, frames_per_buffer=CHUNK)
        try:
            print("Recording...")
            chunk_count = int(RATE / CHUNK * AUDIO_DURATION)
            frames = [stream.read(CHUNK) for _ in range(chunk_count)]
            print("Finished recording.")
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    # The context manager closes the file even if writing fails.
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))


# record audio and predict emotion from speech
def predict_speech_emotion():
    """Record a short microphone clip and classify its emotion.

    The clip is written to ``temp_audio.wav`` (relative to the current
    working directory), reloaded with librosa, converted to MFCC features
    padded or cropped to ``MAX_PAD_LEN`` frames, and passed to
    ``speech_model``.

    Returns:
        The label from ``emotion_labels`` for the highest-scoring class,
        ``"Unknown"`` if that class index has no label, or ``"Error"`` if
        anything fails (the error is printed, not raised).
    """
    try:
        _record_wav('temp_audio.wav')

        signal, sr = librosa.load('temp_audio.wav', duration=AUDIO_DURATION, sr=None)
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=MFCC_LENGTH)

        # Padding or cropping the features to match the required length
        missing = MAX_PAD_LEN - mfcc.shape[1]
        if missing > 0:
            mfcc_padded = np.pad(mfcc, ((0, 0), (0, missing)), mode='constant')
        else:
            mfcc_padded = mfcc[:, :MAX_PAD_LEN]

        # Add a leading batch axis before handing the features to the model.
        mfcc_input = np.expand_dims(mfcc_padded, axis=0)
        prediction = speech_model.predict(mfcc_input)
        return emotion_labels.get(int(np.argmax(prediction)), "Unknown")
    except Exception as e:
        # Deliberate best-effort: the GUI loop shows "Error" and keeps running.
        print("Error in predict_speech_emotion:", e)
        return "Error"

# Function to predict facial expression from image
def predict_facial_expression(image):
    """Classify the facial expression shown in a BGR image.

    The image is converted to grayscale, resized to ``IMAGE_SIZE``, divided
    by 255 as float32, and given batch and channel axes before being passed
    to ``facial_expression_model``.

    Returns:
        The label from ``expression_labels`` for the highest-scoring class,
        ``"Unknown"`` if that class index has no label, or ``"Error"`` if
        anything fails (the error is printed, not raised).
    """
    try:
        grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        small = cv2.resize(grey, IMAGE_SIZE)

        # Add the channel axis (last) and the batch axis (first), then scale.
        batch = small[np.newaxis, ..., np.newaxis].astype('float32') / 255.0

        scores = facial_expression_model.predict(batch)
        return expression_labels.get(np.argmax(scores), "Unknown")
    except Exception as err:
        print("Error in predict_facial_expression:", err)
        return "Error"

# Function to continuously capture frames from webcam and update GUI
def update_gui():
    """Show the webcam feed with live facial and speech emotion labels.

    Opens camera 0 and a Tk window, then loops: grab a frame, predict the
    facial expression, record and classify a speech clip, refresh the
    window. The loop ends when the user presses "Quit", closes the window,
    or the camera stops delivering frames. The camera is released and the
    window destroyed on every exit path, including exceptions.

    NOTE: ``predict_speech_emotion`` blocks while it records, so the window
    only processes events once per recording cycle.
    """
    cap = cv2.VideoCapture(0)

    root = tk.Tk()
    root.title("Emotion Recognition")

    frame_label = tk.Label(root, text="Facial Expression: ")
    frame_label.pack()
    frame_image_label = tk.Label(root)
    frame_image_label.pack()

    speech_label = tk.Label(root, text="Speech Emotion: ")
    speech_label.pack()

    expression_label = tk.Label(root, text="")
    expression_label.pack()

    speech_emotion_label = tk.Label(root, text="")
    speech_emotion_label.pack()

    # True while the Tk window exists; cleared by quit_application so the
    # capture loop stops instead of touching destroyed widgets.
    window_open = True

    # Function to quit the application
    def quit_application():
        nonlocal window_open
        if not window_open:
            return  # already shut down; destroying twice would raise TclError
        window_open = False
        cap.release()
        cv2.destroyAllWindows()
        root.quit()
        root.destroy()

    # Button to quit the application
    quit_button = tk.Button(root, text="Quit", command=quit_application)
    quit_button.pack()

    # Route the window-manager close button through the same shutdown path;
    # otherwise the window is destroyed while the loop keeps updating it.
    root.protocol("WM_DELETE_WINDOW", quit_application)

    try:
        while window_open:
            ret, frame = cap.read()
            if not ret:
                break

            if frame is not None:
                # facial expression label
                facial_expression = predict_facial_expression(frame)
                expression_label.config(text="Facial Expression: {}".format(facial_expression))

                # speech emotion label
                speech_emotion = predict_speech_emotion()
                speech_emotion_label.config(text="Speech Emotion: {}".format(speech_emotion))

                # Display frame in GUI (OpenCV delivers BGR, PIL expects RGB)
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                photo = ImageTk.PhotoImage(Image.fromarray(rgb_frame))
                frame_image_label.configure(image=photo)
                # Keep a reference so the image is not garbage-collected.
                frame_image_label.image = photo

            # Events (including Quit / window close) are handled here.
            root.update_idletasks()
            root.update()
    finally:
        cap.release()
        cv2.destroyAllWindows()
        if window_open:
            # Loop ended without the user quitting (camera failure or an
            # exception): close the window rather than leaving it orphaned.
            window_open = False
            root.destroy()

# Main function
def main():
    """Entry point: launch the emotion-recognition GUI via ``update_gui``."""
    update_gui()

# Only start the GUI when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
