In [1]:
# Speech Emotion Detection using Text and Audio Analysis
# This script requires the following libraries. Install them using pip:
# pip install sounddevice scipy transformers torch SpeechRecognition pyaudio
#
# Note: PyAudio can be tricky to install. You may need to install a pre-compiled
# wheel file for your system from the PyAudio PyPI page. For some Linux systems,
# you may also need to install portaudio19-dev: sudo apt-get install portaudio19-dev
# On macOS, you might need to run `brew install portaudio` first.

import sounddevice as sd
from scipy.io.wavfile import write
import speech_recognition as sr
from transformers import pipeline
import numpy as np
import torch
import warnings

# Suppress Hugging Face warnings for a cleaner output
warnings.filterwarnings("ignore", category=UserWarning, module="transformers.modeling_utils")

# --- Configuration ---
SAMPLE_RATE = 16000  # Sample rate for audio capture
DURATION = 5  # Duration of recording in seconds

# --- Model Loading ---

print("Loading models... This may take a moment.")
try:
    # 1. Text-based Emotion Model (Hugging Face)
    # This model is fine-tuned for emotion classification from text.
    text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

    # 2. Audio-based Emotion Model (Hugging Face)
    # This model is fine-tuned for emotion classification from raw audio.
    audio_classifier = pipeline("audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
except Exception as e:
    print(f"Error loading models: {e}")
    print("Please ensure you have an internet connection and the transformers and torch libraries are correctly installed.")
    exit()

# --- Audio Capture Function ---
def capture_audio(duration=DURATION, samplerate=SAMPLE_RATE):
    """
    Records audio from the microphone for a specified duration.
    
    Args:
        duration (int): The duration of the recording in seconds.
        samplerate (int): The sample rate of the recording.

    Returns:
        np.ndarray: A NumPy array containing the recorded audio data.
    """
    print(f"\nRecording for {duration} seconds... Please speak clearly.")
    recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype='float32')
    sd.wait()  # Wait until recording is finished
    print("Recording complete.")
    return recording.squeeze()

# --- Analysis Functions ---
def text_analysis(audio_data):
    """
    Performs emotion detection on the audio by first converting it to text.
    
    Args:
        audio_data (np.ndarray): The recorded audio data.

    Returns:
        dict: A dictionary of emotions and their scores from the text model.
    """
    try:
        # Save the audio data to a temporary WAV file for the speech recognizer
        temp_wav_path = "temp_audio.wav"
        scaled_data = np.int16(audio_data / np.max(np.abs(audio_data)) * 32767)
        write(temp_wav_path, SAMPLE_RATE, scaled_data)

        recognizer = sr.Recognizer()
        with sr.AudioFile(temp_wav_path) as source:
            audio = recognizer.record(source)
            text = recognizer.recognize_google(audio)
            print(f"\nText recognized: '{text}'")

            # Get scores from the text-based emotion classifier
            text_result = text_classifier(text)
            return {item['label']: item['score'] for item in text_result[0]}

    except sr.UnknownValueError:
        print("\nText-based analysis failed: Could not understand audio. Try speaking more clearly.")
        return {}
    except sr.RequestError as e:
        print(f"\nText-based analysis failed: Could not request results from Google Speech Recognition service; {e}")
        return {}
    except Exception as e:
        print(f"\nAn error occurred during text analysis: {e}")
        return {}

def audio_analysis(audio_data):
    """
    Performs emotion detection directly on the audio waveform.
    
    Args:
        audio_data (np.ndarray): The recorded audio data.

    Returns:
        dict: A dictionary of emotions and their scores from the audio model.
    """
    try:
        # The audio_classifier expects a specific format.
        # We need to wrap the numpy array in a dictionary with the 'sampling_rate' key.
        audio_input = {"raw": audio_data, "sampling_rate": SAMPLE_RATE}
        
        # Get scores from the audio-based emotion classifier
        audio_result = audio_classifier(audio_input)
        return {item['label']: item['score'] for item in audio_result}
    except Exception as e:
        print(f"\nAn error occurred during audio analysis: {e}")
        return {}

def combine_results(text_scores, audio_scores):
    """
    Combines the results from the two analysis methods.
    It takes a simple average of the scores for common emotions.
    
    Args:
        text_scores (dict): Scores from the text-based model.
        audio_scores (dict): Scores from the audio-based model.

    Returns:
        tuple: A tuple containing the combined scores (dict) and the final predicted emotion (str).
    """
    combined_scores = {}
    
    # Identify common emotions to average
    common_emotions = set(text_scores.keys()).intersection(set(audio_scores.keys()))
    
    # Simple averaging for common emotions
    for emotion in common_emotions:
        combined_scores[emotion] = (text_scores.get(emotion, 0) + audio_scores.get(emotion, 0)) / 2
        
    # Add unique emotions with their original scores
    for emotion, score in text_scores.items():
        if emotion not in common_emotions:
            combined_scores[emotion] = score
    
    for emotion, score in audio_scores.items():
        if emotion not in common_emotions:
            combined_scores[emotion] = score

    # Determine the most likely emotion
    if combined_scores:
        final_emotion = max(combined_scores, key=combined_scores.get)
    else:
        final_emotion = "Undetermined"
        
    return combined_scores, final_emotion

# --- Main Function ---
def main():
    """Main function to run the emotion detection process."""
    try:
        # Step 1: Record audio
        audio_data = capture_audio()
        
        # Step 2: Perform text-based analysis
        text_scores = text_analysis(audio_data)

        # Step 3: Perform audio-based analysis
        audio_scores = audio_analysis(audio_data)

        # Step 4: Combine and display results
        if not text_scores and not audio_scores:
            print("\nCould not perform emotion analysis. Please check your microphone and try again.")
            return

        print("\n--- Analysis Results ---")
        
        print("\nMethod 1: Text-based Analysis (from Hugging Face Model)")
        if text_scores:
            for emotion, score in sorted(text_scores.items(), key=lambda item: item[1], reverse=True):
                print(f"  {emotion.capitalize()}: {score:.2f}")
        else:
            print("  Analysis failed.")
        
        print("\nMethod 2: Audio-based Analysis (from Hugging Face Model)")
        if audio_scores:
            for emotion, score in sorted(audio_scores.items(), key=lambda item: item[1], reverse=True):
                print(f"  {emotion.capitalize()}: {score:.2f}")
        else:
            print("  Analysis failed.")
        
        # Combine the results
        combined_scores, final_emotion = combine_results(text_scores, audio_scores)

        print("\n--- Final Combined Outcome ---")
        print("Final Prediction:", final_emotion.capitalize())
        print("Combined Scores:")
        for emotion, score in sorted(combined_scores.items(), key=lambda item: item[1], reverse=True):
            print(f"  {emotion.capitalize()}: {score:.2f}")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    main()


Loading models... This may take a moment.



Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-e


Recording for 5 seconds... Please speak clearly.
Recording complete.

Text recognized: 'hello I want to win this'

--- Analysis Results ---

Method 1: Text-based Analysis (from Hugging Face Model)
  Neutral: 0.61
  Surprise: 0.27
  Sadness: 0.06
  Anger: 0.03
  Joy: 0.02
  Fear: 0.01
  Disgust: 0.00

Method 2: Audio-based Analysis (from Hugging Face Model)
  Happy: 0.13
  Neutral: 0.13
  Disgust: 0.13
  Calm: 0.13
  Surprised: 0.13

--- Final Combined Outcome ---
Final Prediction: Neutral
Combined Scores:
  Neutral: 0.37
  Surprise: 0.27
  Happy: 0.13
  Calm: 0.13
  Surprised: 0.13
  Disgust: 0.07
  Sadness: 0.06
  Anger: 0.03
  Joy: 0.02
  Fear: 0.01
