In [11]:
import warnings

from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification

# Single source of truth for the checkpoint so the feature extractor and the
# model can never drift apart.
MODEL_ID = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"

# Feature extractor: converts raw waveforms into the tensors the model expects.
processor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)

# output_loading_info=True makes from_pretrained also return a dict describing
# which weights could not be restored from the checkpoint.
model, loading_info = AutoModelForAudioClassification.from_pretrained(
    MODEL_ID, output_loading_info=True
)

# Weights listed under "missing_keys" are randomly initialised. If the
# classification head is among them the model outputs near-uniform scores, so
# surface that loudly instead of leaving it buried in the loader's log text.
uninitialised_head = [
    key for key in loading_info["missing_keys"]
    if key.startswith(("classifier", "projector"))
]
if uninitialised_head:
    warnings.warn(
        f"Classification head weights {uninitialised_head} were not found in "
        f"{MODEL_ID} and are randomly initialised; predictions will be "
        f"unreliable until the head is trained or loaded from a compatible "
        f"checkpoint.",
        RuntimeWarning,
    )

Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

In [23]:
# Use the model to predict emotions from audio inputs
import torch
import librosa
import numpy as np
import soundfile as sf

def predict_emotion(audio_path, sampling_rate=16000, max_duration=30):
    """
    Predict emotion from an audio file

    Uses the notebook-level ``processor`` (feature extractor) and ``model``
    (audio classifier) defined in an earlier cell.

    Args:
        audio_path: Path to the audio file
        sampling_rate: Rate in Hz the audio is resampled to while loading
            (default: 16000 Hz). ``None`` keeps the file's native rate, which
            is then what gets reported to the feature extractor.
        max_duration: Maximum duration in seconds to process (default: 30).
            A falsy value (``None`` or ``0``) loads the whole file.

    Returns:
        Dictionary with emotion predictions and scores:
        'emotion' (predicted label), 'confidence' (its probability) and
        'all_scores' (label -> probability for every class).

    Raises:
        ValueError: If the audio cannot be loaded or is empty. The underlying
            exception is chained as ``__cause__``.
    """
    try:
        # The header probe only feeds the sanity print and the duration cap,
        # so a failure here must not prevent librosa from trying to decode.
        try:
            info = sf.info(audio_path)
        except RuntimeError as probe_error:
            info = None
            print(f"Could not read audio header ({probe_error}); loading without it")
        else:
            print(f"Audio info: {info.duration:.2f}s, {info.samplerate}Hz, {info.channels} channel(s)")

        # Load audio file with duration limit
        if not max_duration:
            duration = None
        elif info is None:
            duration = max_duration
        else:
            duration = min(info.duration, max_duration)
        audio, sr = librosa.load(audio_path, sr=sampling_rate, duration=duration)

        # Ensure audio is not empty
        if len(audio) == 0:
            raise ValueError("Audio file is empty or could not be loaded")

        # Use the rate librosa actually returned; it differs from the
        # requested one when sampling_rate is None.
        print(f"Loaded audio: {len(audio)} samples, {len(audio)/sr:.2f}s")

    except Exception as e:
        # Chain the original exception so the real traceback stays visible.
        raise ValueError(f"Error loading audio file: {str(e)}") from e

    # Process audio with feature extractor, declaring the true sample rate
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

    # Keep the inputs on the same device as the model (no-op on CPU)
    device = model.device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Make prediction
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get probabilities for the single clip in the batch
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]

    # Get predicted class and confidence
    predicted_class_id = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[predicted_class_id].item()

    # Get emotion label
    id2label = model.config.id2label
    predicted_emotion = id2label[predicted_class_id]

    # Get all emotion scores
    all_emotions = {id2label[i]: probabilities[i].item()
                    for i in range(len(id2label))}

    return {
        'emotion': predicted_emotion,
        'confidence': confidence,
        'all_scores': all_emotions
    }

# Example usage:
# result = predict_emotion('path/to/your/audio.wav')
# print(f"Predicted emotion: {result['emotion']}")
# print(f"Confidence: {result['confidence']:.2%}")
# print(f"\nAll emotion scores:")
# for emotion, score in result['all_scores'].items():
#     print(f"  {emotion}: {score:.2%}")

In [None]:
# Try the classifier on a single audio file
# Point this at your own MP3 before running the cell
audio_file = 'your_audio.mp3'  # Change this to your MP3 file path

# Run inference on the chosen file
result = predict_emotion(audio_file)

# Headline prediction, framed by separator rules
separator = '=' * 50
print(f"\n{separator}")
print(f"Predicted emotion: {result['emotion']}")
print(f"Confidence: {result['confidence']:.2%}")
print(f"{separator}")

# Per-class breakdown, highest score first
print(f"\nAll emotion scores (sorted by confidence):")
ranked_scores = sorted(
    result['all_scores'].items(),
    key=lambda pair: pair[1],
    reverse=True,
)
for emotion, score in ranked_scores:
    print(f"  {emotion:12s}: {score:.2%}")

Audio info: 5.12s, 16000Hz, 1 channel(s)
Loaded audio: 81920 samples, 5.12s
Predicted emotion: surprised
Confidence: 13.10%

All emotion scores:
  surprised: 13.10%
  happy: 12.89%
  sad: 12.59%
  angry: 12.48%
  disgust: 12.36%
  fearful: 12.28%
  neutral: 12.16%
  calm: 12.14%


In [33]:
# Convert temp3.wav to 16 kHz mono so it matches predict_emotion's default
# sampling_rate of 16000: -ar sets the sample rate, -ac the channel count,
# and -y overwrites temp.wav without prompting.
# NOTE(review): temp3.wav is not created by any cell visible here - confirm
# where it comes from so the notebook survives Restart & Run All.
!ffmpeg -i temp3.wav -ar 16000 -ac 1 temp.wav -y

ffmpeg version 7.1.1-1ubuntu1.2 Copyright (c) 2000-2025 the FFmpeg developers
  built with gcc 14 (Ubuntu 14.2.0-19ubuntu2)
  configuration: --prefix=/usr --extra-version=1ubuntu1.2 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-libmfx --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg -