# Speech Emotion Recognition (SER)

This notebook builds a Speech Emotion Recognition model using 4 popular datasets:
- RAVDESS
- CREMA-D
- TESS
- SAVEE

## Emotions Covered:
- Angry
- Disgust
- Fear
- Happy
- Neutral
- Sad
- Surprise (in some datasets)
- Calm (RAVDESS only)

## 0. Install Dependencies

In [None]:
import sys
import subprocess
import platform

# Third-party packages the notebook needs. TensorFlow is installed separately
# below because the right distribution depends on the platform.
packages = [
    'kagglehub',
    'numpy',
    'pandas',
    'librosa',
    'matplotlib',
    'seaborn',
    'scikit-learn',
    'soundfile'  # Required by librosa for audio file handling
]

print("Installing dependencies...")
print(f"Python version: {sys.version}")
print(f"Platform: {platform.system()} {platform.machine()}\n")

# Names of everything that failed to install, so the final status line does
# not claim success after an error.
failed_packages = []

# Install common packages one at a time so a single failure does not stop the rest
for package in packages:
    try:
        print(f"Installing {package}...")
        # sys.executable targets the interpreter that runs this kernel
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', package])
        print(f"✓ {package} installed")
    except subprocess.CalledProcessError as e:
        print(f"✗ Failed to install {package}: {e}")
        failed_packages.append(package)

# Install TensorFlow based on platform
print("\nInstalling TensorFlow...")
try:
    if platform.system() == 'Darwin' and platform.machine() == 'arm64':
        # macOS with Apple Silicon
        print("Detected macOS with Apple Silicon")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'tensorflow-macos'])
        print("✓ tensorflow-macos installed")
    else:
        # Other platforms
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'tensorflow'])
        print("✓ tensorflow installed")
except subprocess.CalledProcessError:
    failed_packages.append('tensorflow')
    print("✗ TensorFlow installation failed")
    print("⚠ TensorFlow may not support your Python version yet")
    print(f"  Current Python: {sys.version_info.major}.{sys.version_info.minor}")
    print("  Try using Python 3.9, 3.10, 3.11, or 3.12")
    print("\nAlternatives:")
    print("1. Create a new virtual environment with Python 3.11:")
    print("   python3.11 -m venv .venv")
    print("   source .venv/bin/activate")
    print("2. Use conda: conda create -n ser python=3.11 tensorflow")

print("\n" + "="*50)
if failed_packages:
    # Later cells will fail on import; make that obvious here.
    print(f"Installation finished with errors. Failed: {', '.join(failed_packages)}")
else:
    print("Installation complete!")

Installing dependencies...
Python version: 3.14.0 (main, Oct  7 2025, 09:34:52) [Clang 17.0.0 (clang-1700.4.4.1)]
Platform: Darwin arm64

Installing kagglehub...
✓ kagglehub installed
Installing numpy...
✓ numpy installed
Installing pandas...
✓ pandas installed
Installing librosa...


## 1. Download Dataset from Kaggle

In [3]:
import kagglehub

# Download latest version
# The returned value is the local directory holding the dataset files;
# later cells read it through `path`.
path = kagglehub.dataset_download("dmitrybabko/speech-emotion-recognition-en")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/dmitrybabko/speech-emotion-recognition-en?dataset_version_number=1...


100%|██████████| 987M/987M [17:26<00:00, 989kB/s]  

Extracting files...





Path to dataset files: /Users/ismatsamadov/.cache/kagglehub/datasets/dmitrybabko/speech-emotion-recognition-en/versions/1


## 2. Import Required Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import warnings
# NOTE(review): this silences every Python warning for the rest of the
# session, including deprecation warnings from the libraries imported above.
warnings.filterwarnings('ignore')

# Record the library versions used for this run.
print("TensorFlow version:", tf.__version__)
print("Librosa version:", librosa.__version__)

ModuleNotFoundError: No module named 'pandas.util._print_versions'

## 3. Explore Dataset Structure

In [None]:
# List all files in the dataset directory
import glob

# Update this path based on where kagglehub downloaded the data
data_path = path

# Walk the dataset tree and summarise each directory.
for root, dirs, files in os.walk(data_path):
    print(f"Directory: {root}")
    print(f"Subdirectories: {dirs}")
    print(f"Number of files: {len(files)}")
    # os.walk lists files in arbitrary order, so checking only files[0] would
    # hide the sample whenever a non-audio file happens to come first.
    # Filter for audio explicitly and sort for a stable listing.
    wav_files = sorted(name for name in files if name.lower().endswith('.wav'))
    if wav_files:
        print(f"Sample files: {wav_files[:3]}")
    print("-" * 80)

## 4. Data Loading and Label Extraction

We'll create functions to extract emotion labels from each dataset based on their naming conventions.

In [None]:
def get_emotion_ravdess(filename):
    """
    Map a RAVDESS filename to its emotion label.

    RAVDESS names are dash-separated numeric fields, e.g.
    03-01-{emotion}-01-01-01-01.wav; the third field is the emotion code:
    01=neutral, 02=calm, 03=happy, 04=sad, 05=angry, 06=fearful,
    07=disgust, 08=surprised.

    Returns the label string, or 'unknown' when the name has fewer than
    three dash-separated fields or the code is not recognised.
    """
    fields = os.path.basename(filename).split('-')
    if len(fields) < 3:
        return 'unknown'

    # Emotion codes paired positionally with the labels used by this notebook
    labels_by_code = dict(zip(
        ('01', '02', '03', '04', '05', '06', '07', '08'),
        ('neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprise'),
    ))
    return labels_by_code.get(fields[2], 'unknown')

def get_emotion_crema(filename):
    """
    Map a CREMA-D filename to its emotion label.

    CREMA-D names are underscore-separated, e.g. 1001_DFA_ANG_XX.wav; the
    third field is the emotion code (SAD, ANG, DIS, FEA, HAP, NEU). The
    lookup is case-sensitive.

    Returns the label string, or 'unknown' when the name has fewer than
    three underscore-separated fields or the code is not recognised.
    """
    fields = os.path.basename(filename).split('_')
    if len(fields) < 3:
        return 'unknown'

    labels_by_code = dict(
        SAD='sad',
        ANG='angry',
        DIS='disgust',
        FEA='fear',
        HAP='happy',
        NEU='neutral',
    )
    return labels_by_code.get(fields[2], 'unknown')

def get_emotion_tess(filename):
    """
    Extract emotion from TESS filename
    Format: YAF_dog_angry.wav  ({speaker}_{word}_{emotion}.wav)

    The emotion is read from the last underscore-separated field of the
    name. This keeps a spoken word that resembles an emotion from being
    mistaken for the label, and it recognises the 'ps' suffix, which TESS
    uses for "pleasant surprise".

    If the last field is not a known emotion token, the function falls back
    to searching the whole basename for an emotion word, so names that do
    not follow the three-field layout are still labelled.

    Returns the label string, or 'unknown' when nothing matches.
    """
    # Tokens accepted as the trailing emotion field
    token_map = {
        'angry': 'angry',
        'anger': 'angry',
        'disgust': 'disgust',
        'fear': 'fear',
        'happy': 'happy',
        'happiness': 'happy',
        'neutral': 'neutral',
        'sad': 'sad',
        'sadness': 'sad',
        'surprise': 'surprise',
        'surprised': 'surprise',
        'ps': 'surprise',  # "pleasant surprise"
    }

    basename = os.path.basename(filename).lower()
    stem = os.path.splitext(basename)[0]
    emotion_token = stem.rsplit('_', 1)[-1]
    if emotion_token in token_map:
        return token_map[emotion_token]

    # Fallback: substring search, checked in priority order.
    substring_rules = (
        ('angry', 'angry'),
        ('anger', 'angry'),
        ('disgust', 'disgust'),
        ('fear', 'fear'),
        ('happy', 'happy'),
        ('happiness', 'happy'),
        ('neutral', 'neutral'),
        ('sad', 'sad'),
        ('surprise', 'surprise'),
        ('pleasant', 'happy'),
    )
    for needle, label in substring_rules:
        if needle in basename:
            return label
    return 'unknown'

def get_emotion_savee(filename):
    """
    Extract emotion from SAVEE filename
    Format: a01.wav, d01.wav, etc., optionally with a speaker prefix such as
    DC_a01.wav.
    a=anger, d=disgust, f=fear, h=happiness, n=neutral, sa=sadness, su=surprise

    Only the part after the last underscore is inspected. Without this, a
    speaker prefix would be read as the emotion code: every 'DC_*' file
    would become 'disgust', and prefixes starting with other letters would
    become 'unknown'.

    Returns the label string, or 'unknown' when no code matches.
    """
    # Two-letter codes are listed first so they are tested before any
    # single-letter code.
    code_labels = (
        ('sa', 'sad'),
        ('su', 'surprise'),
        ('a', 'angry'),
        ('d', 'disgust'),
        ('f', 'fear'),
        ('h', 'happy'),
        ('n', 'neutral'),
    )

    basename = os.path.basename(filename).lower()
    # Drop an optional "{speaker}_" prefix; names without one are unchanged.
    emotion_part = basename.rsplit('_', 1)[-1]
    for code, label in code_labels:
        if emotion_part.startswith(code):
            return label
    return 'unknown'

def get_emotion(filepath):
    """
    Pick the dataset-specific parser for a file and return its emotion label.

    The dataset is recognised first from the lower-cased path ('ravdess' or
    'actor_', then 'crema', 'tess', 'savee'). If none of those appear, the
    shape of the filename decides: more than five dash-separated fields is
    treated as RAVDESS, a name containing '_' is tried as CREMA-D and then
    TESS, and anything else is treated as SAVEE.
    """
    lowered = filepath.lower()

    # Dataset named somewhere in the path
    if 'ravdess' in lowered or 'actor_' in lowered:
        return get_emotion_ravdess(filepath)
    if 'crema' in lowered:
        return get_emotion_crema(filepath)
    if 'tess' in lowered:
        return get_emotion_tess(filepath)
    if 'savee' in lowered:
        return get_emotion_savee(filepath)

    # No dataset hint in the path: infer from the filename layout
    name = os.path.basename(filepath)
    if len(name.split('-')) > 5:
        return get_emotion_ravdess(filepath)
    if '_' in name:
        # Try CREMA first, then TESS
        label = get_emotion_crema(filepath)
        return get_emotion_tess(filepath) if label == 'unknown' else label
    return get_emotion_savee(filepath)
    
# Smoke-test each dataset-specific parser on one representative filename
print("Testing emotion extraction:")
_parser_samples = [
    ("RAVDESS", get_emotion_ravdess, "03-01-05-01-01-01-01.wav"),
    ("CREMA", get_emotion_crema, "1001_DFA_ANG_XX.wav"),
    ("TESS", get_emotion_tess, "YAF_dog_angry.wav"),
    ("SAVEE", get_emotion_savee, "a01.wav"),
]
for _dataset_name, _parser, _sample_name in _parser_samples:
    print(f"{_dataset_name} test:", _parser(_sample_name))

## 5. Feature Extraction

We'll extract multiple audio features:
- MFCC (Mel-frequency cepstral coefficients)
- Chroma
- Mel Spectrogram
- Zero Crossing Rate
- Spectral Features (Spectral Contrast, Spectral Rolloff)
- Tonnetz (tonal centroid features)

In [None]:
def extract_features(file_path, duration=3, sr=22050):
    """
    Build one fixed-length feature vector for an audio file.

    Each feature matrix is averaged over its time frames, and the averages
    are concatenated in this order: MFCC (40 coefficients), chroma, mel
    spectrogram, spectral contrast, tonnetz, zero-crossing rate (scalar),
    spectral rolloff (scalar).

    Parameters:
    - file_path: path to audio file
    - duration: maximum duration to load (in seconds)
    - sr: sampling rate passed to librosa.load

    Returns:
    - numpy array of concatenated features, or None if any step raised
      (the error is printed rather than propagated)
    """
    def _frame_mean(matrix):
        # Collapse the time axis: one mean per feature row
        return np.mean(matrix.T, axis=0)

    try:
        signal, rate = librosa.load(file_path, duration=duration, sr=sr)

        # Per-frame feature matrices reduced to per-feature means
        mfcc_vec = _frame_mean(librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40))
        chroma_vec = _frame_mean(librosa.feature.chroma_stft(y=signal, sr=rate))
        mel_vec = _frame_mean(librosa.feature.melspectrogram(y=signal, sr=rate))
        contrast_vec = _frame_mean(librosa.feature.spectral_contrast(y=signal, sr=rate))
        tonnetz_vec = _frame_mean(librosa.feature.tonnetz(y=signal, sr=rate))

        # Single-number summaries (mean over all elements)
        zcr_value = np.mean(librosa.feature.zero_crossing_rate(signal))
        rolloff_value = np.mean(librosa.feature.spectral_rolloff(y=signal, sr=rate))

        return np.hstack([
            mfcc_vec,
            chroma_vec,
            mel_vec,
            contrast_vec,
            tonnetz_vec,
            zcr_value,
            rolloff_value,
        ])

    except Exception as e:
        # Best effort: report the file and let the caller skip it
        print(f"Error processing {file_path}: {str(e)}")
        return None

# Confirms the cell ran, so extract_features is defined for the cells below.
print("Feature extraction function ready.")

## 6. Load All Audio Files and Extract Features

In [None]:
# Collect all .wav files from the dataset.
# The list is sorted because os.walk yields entries in arbitrary,
# filesystem-dependent order. A stable order keeps the later seeded
# train/test split and the index-based sample picks reproducible.
# The extension check is case-insensitive so '.WAV' files are included.
audio_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(data_path)
    for file in files
    if file.lower().endswith('.wav')
)

print(f"Total audio files found: {len(audio_files)}")

# Display sample file paths
if len(audio_files) > 0:
    print("\nSample file paths:")
    for i, sample_path in enumerate(audio_files[:5], 1):
        print(f"{i}. {sample_path}")
        print(f"   Detected emotion: {get_emotion(sample_path)}")

In [None]:
# Extract features and labels for all audio files
print("Extracting features from all audio files...")
print("This may take several minutes depending on the dataset size.\n")

# Parallel lists: entry i of each describes the same audio file
features_list = []
labels_list = []
file_paths = []

# Process files with progress indication
total_files = len(audio_files)
for idx, file_path in enumerate(audio_files, 1):
    if idx % 100 == 0:
        print(f"Processing: {idx}/{total_files} files ({idx/total_files*100:.1f}%)")

    # Resolve the label first: it is a cheap string operation, and files
    # with an unknown label are discarded anyway, so there is no reason to
    # pay for audio decoding and feature extraction on them.
    emotion = get_emotion(file_path)
    if emotion == 'unknown':
        continue

    # Extract features (returns None when the file could not be processed)
    features = extract_features(file_path)
    if features is None:
        continue

    features_list.append(features)
    labels_list.append(emotion)
    file_paths.append(file_path)

print(f"\nFeature extraction complete!")
print(f"Successfully processed: {len(features_list)} files")
print(f"Skipped: {total_files - len(features_list)} files")

## 7. Data Analysis and Visualization

In [None]:
# Tabulate the processed files with their labels for quick inspection
df = pd.DataFrame({'file_path': file_paths, 'emotion': labels_list})
emotion_counts = df['emotion'].value_counts()

print("Dataset Overview:")
print(f"Total samples: {len(df)}")
print("\nEmotion distribution:")
print(emotion_counts)

# Bar chart of class sizes, most frequent emotion first
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='emotion', order=emotion_counts.index, ax=ax)
ax.set_title('Emotion Distribution in Dataset', fontsize=16)
ax.set_xlabel('Emotion', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Size of one feature vector (taken from the first sample)
first_vector = features_list[0]
print(f"\nFeature vector shape: {first_vector.shape}")
print(f"Number of features per sample: {len(first_vector)}")

## 8. Data Preparation for Training

In [None]:
# Convert lists to numpy arrays
X = np.array(features_list)
y = np.array(labels_list)

print(f"Feature matrix shape: {X.shape}")
print(f"Labels shape: {y.shape}")

# Encode labels: strings -> integer ids -> one-hot rows
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

print(f"\nEncoded labels shape: {y_categorical.shape}")
print(f"Number of emotion classes: {len(label_encoder.classes_)}")
print(f"Emotion classes: {label_encoder.classes_}")

# Split data into train, validation, and test sets (70 / 15 / 15)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_categorical, test_size=0.3, random_state=42, stratify=y_encoded
)

# Stratify the second split as well, so validation and test keep the same
# class proportions as the training set. y_temp is one-hot, so the integer
# class ids are recovered with argmax.
# NOTE: stratification requires at least two samples of each class in the
# held-out 30%.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42,
    stratify=np.argmax(y_temp, axis=1)
)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Standardize features. The scaler is fitted on the training set only and
# then applied to validation and test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("\nData standardization complete.")

## 9. Build Deep Learning Model

In [None]:
def create_model(input_shape, num_classes):
    """
    Create a deep neural network for speech emotion recognition

    Parameters:
    - input_shape: number of features per sample (an integer)
    - num_classes: number of emotion classes; sets the width of the softmax
      output layer

    Returns:
    - an uncompiled Sequential model made of four Dense/Dropout/
      BatchNormalization stages (512, 256, 128, 64 units) followed by the
      softmax output layer
    """
    model = models.Sequential([
        # Explicit input layer. Passing input_shape= to the first Dense
        # layer is deprecated in current Keras.
        layers.Input(shape=(input_shape,)),

        layers.Dense(512, activation='relu'),
        layers.Dropout(0.3),
        layers.BatchNormalization(),

        # Hidden layers
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.3),
        layers.BatchNormalization(),

        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.BatchNormalization(),

        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        layers.BatchNormalization(),

        # Output layer
        layers.Dense(num_classes, activation='softmax')
    ])

    return model

# Size the network from the prepared data: one input per feature column,
# one output per one-hot label column
num_features, num_classes = X_train_scaled.shape[1], y_categorical.shape[1]
model = create_model(num_features, num_classes)

# Adam optimiser with categorical cross-entropy, matching the one-hot labels
adam_optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer architecture
model.summary()

## 10. Train the Model

In [None]:
# Stop once validation loss has not improved for 15 epochs, restoring the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1)

# Halve the learning rate after 5 epochs without validation-loss improvement
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7, verbose=1)

# Keep the checkpoint with the highest validation accuracy on disk
model_checkpoint = ModelCheckpoint('best_ser_model.keras', monitor='val_accuracy', save_best_only=True, verbose=1)

training_callbacks = [early_stopping, reduce_lr, model_checkpoint]

# Fit on the scaled training data, validating after every epoch
print("Starting model training...\n")

history = model.fit(
    X_train_scaled,
    y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=100,
    batch_size=32,
    callbacks=training_callbacks,
    verbose=1,
)

print("\nTraining complete!")

## 11. Visualize Training History

In [None]:
# One panel per tracked metric, each showing train and validation curves
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for ax, metric in zip(axes, ('Accuracy', 'Loss')):
    key = metric.lower()  # history keys are 'accuracy' / 'loss' and their 'val_' variants
    ax.plot(history.history[key], label=f'Train {metric}')
    ax.plot(history.history[f'val_{key}'], label=f'Validation {metric}')
    ax.set_title(f'Model {metric}', fontsize=14)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 12. Evaluate Model Performance

In [None]:
# Evaluate on test set
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)

print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
print(f"Test Loss: {test_loss:.4f}")

# Make predictions; argmax turns probability / one-hot rows into class ids
y_pred = model.predict(X_test_scaled)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Classification report
# Passing the full list of class ids keeps the report aligned with
# target_names even if some class is missing from both the test labels and
# the predictions; without it that case raises an error.
class_indices = np.arange(len(label_encoder.classes_))

print("\n" + "="*60)
print("Classification Report:")
print("="*60)
print(classification_report(
    y_test_classes,
    y_pred_classes,
    labels=class_indices,
    target_names=label_encoder.classes_,
    zero_division=0
))

## 13. Confusion Matrix Visualization

In [None]:
# Create confusion matrix
# Listing every class id guarantees one row and column per entry of
# label_encoder.classes_, so the tick labels and the per-class loop below
# stay aligned even if a class is missing from the test split.
class_indices = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_test_classes, y_pred_classes, labels=class_indices)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_
)
plt.title('Confusion Matrix - Speech Emotion Recognition', fontsize=16)
plt.xlabel('Predicted Emotion', fontsize=12)
plt.ylabel('True Emotion', fontsize=12)
plt.tight_layout()
plt.show()

# Calculate and display per-class accuracy (correct predictions divided by
# the number of true samples of that class, i.e. per-class recall)
print("\nPer-Class Accuracy:")
print("="*40)
for i, emotion in enumerate(label_encoder.classes_):
    support = cm[i].sum()
    if support == 0:
        # No true samples of this class in the test set: avoid dividing by zero
        print(f"{emotion.capitalize()}: n/a (no test samples)")
        continue
    class_accuracy = cm[i, i] / support * 100
    print(f"{emotion.capitalize()}: {class_accuracy:.2f}%")

## 14. Save Model and Preprocessing Objects

In [None]:
import pickle

# Persist the trained network in Keras format
model.save('speech_emotion_recognition_model.keras')
print("Model saved as 'speech_emotion_recognition_model.keras'")

# Pickle the fitted preprocessing objects needed to reuse the model at inference time
for description, artifact, target in (
    ("Scaler", scaler, 'scaler.pkl'),
    ("Label encoder", label_encoder, 'label_encoder.pkl'),
):
    with open(target, 'wb') as f:
        pickle.dump(artifact, f)
    print(f"{description} saved as '{target}'")

print("\nAll model artifacts saved successfully!")

## 15. Prediction Function for New Audio Files

In [None]:
def predict_emotion(audio_file_path, model, scaler, label_encoder):
    """
    Classify the emotion in a single audio file.

    Parameters:
    - audio_file_path: path to the audio file
    - model: trained Keras model
    - scaler: fitted StandardScaler
    - label_encoder: fitted LabelEncoder

    Returns:
    - predicted_emotion: label of the highest-scoring class
    - probabilities: dict mapping every class label to its score in percent

    Returns (None, None) when feature extraction fails for the file.
    """
    feature_vector = extract_features(audio_file_path)
    if feature_vector is None:
        return None, None

    # The scaler and model expect a 2-D batch, so wrap the vector as one row
    scaled_row = scaler.transform(feature_vector.reshape(1, -1))
    class_scores = model.predict(scaled_row, verbose=0)

    class_names = label_encoder.classes_
    predicted_emotion = class_names[np.argmax(class_scores)]

    # Score of each class for the single input row, as a percentage
    probabilities = {
        name: class_scores[0][position] * 100
        for position, name in enumerate(class_names)
    }

    return predicted_emotion, probabilities

# Demo: run the prediction on the first successfully processed file.
# This file may have been part of the training split.
if file_paths:
    test_file = file_paths[0]
    print(f"Testing prediction on: {os.path.basename(test_file)}")
    print(f"Actual emotion: {get_emotion(test_file)}")

    predicted_emotion, probabilities = predict_emotion(test_file, model, scaler, label_encoder)

    if predicted_emotion:
        print(f"\nPredicted emotion: {predicted_emotion}")
        print("\nProbabilities for all emotions:")
        # Highest probability first
        ranked = sorted(probabilities.items(), key=lambda item: item[1], reverse=True)
        for emotion, prob in ranked:
            print(f"  {emotion.capitalize()}: {prob:.2f}%")

## 16. Visualize Sample Audio and Predictions

In [None]:
def visualize_audio_and_prediction(audio_file_path, model, scaler, label_encoder):
    """
    Visualize audio waveform, spectrogram, and emotion prediction

    Draws a 2x2 figure: waveform, dB spectrogram, MFCCs, and a horizontal
    bar chart of class probabilities with the predicted class highlighted.

    Parameters:
    - audio_file_path: path to the audio file
    - model: trained Keras model
    - scaler: fitted StandardScaler
    - label_encoder: fitted LabelEncoder

    Returns:
    - None. If no prediction can be produced for the file, a message is
      printed and nothing is plotted.
    """
    # Get the prediction first. predict_emotion returns (None, None) when
    # the file cannot be processed, and the probability panel below cannot
    # be drawn without it.
    predicted_emotion, probabilities = predict_emotion(
        audio_file_path, model, scaler, label_encoder
    )
    if predicted_emotion is None or probabilities is None:
        print(f"Could not generate a prediction for {audio_file_path}; skipping visualization.")
        return
    actual_emotion = get_emotion(audio_file_path)

    # Load audio (first 3 seconds)
    y, sr = librosa.load(audio_file_path, duration=3)

    # Create visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Waveform
    axes[0, 0].set_title('Waveform', fontsize=12)
    librosa.display.waveshow(y, sr=sr, ax=axes[0, 0])
    axes[0, 0].set_xlabel('Time')
    axes[0, 0].set_ylabel('Amplitude')

    # Spectrogram (STFT magnitude in dB relative to the peak)
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    img = librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='hz', ax=axes[0, 1])
    axes[0, 1].set_title('Spectrogram', fontsize=12)
    fig.colorbar(img, ax=axes[0, 1], format='%+2.0f dB')

    # MFCC
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    img2 = librosa.display.specshow(mfccs, sr=sr, x_axis='time', ax=axes[1, 0])
    axes[1, 0].set_title('MFCC', fontsize=12)
    fig.colorbar(img2, ax=axes[1, 0])

    # Prediction probabilities; the predicted class is drawn in green
    emotions = list(probabilities.keys())
    probs = list(probabilities.values())
    colors = ['green' if e == predicted_emotion else 'blue' for e in emotions]

    axes[1, 1].barh(emotions, probs, color=colors, alpha=0.7)
    axes[1, 1].set_xlabel('Probability (%)', fontsize=10)
    axes[1, 1].set_title(
        f'Predicted: {predicted_emotion} | Actual: {actual_emotion}',
        fontsize=12,
        fontweight='bold'
    )
    axes[1, 1].grid(True, alpha=0.3, axis='x')

    plt.suptitle(
        f'Audio Analysis: {os.path.basename(audio_file_path)}',
        fontsize=14,
        fontweight='bold'
    )
    plt.tight_layout()
    plt.show()

# Visualize one processed file: the 11th if there are that many, else the first
if file_paths:
    sample_index = 10 if len(file_paths) > 10 else 0
    sample_file = file_paths[sample_index]
    visualize_audio_and_prediction(sample_file, model, scaler, label_encoder)

## Summary

This notebook successfully:

1. Downloaded the Speech Emotion Recognition dataset from Kaggle
2. Processed 4 different datasets (RAVDESS, CREMA-D, TESS, SAVEE)
3. Extracted comprehensive audio features (MFCC, Chroma, Mel Spectrogram, etc.)
4. Built a deep neural network for emotion classification
5. Trained the model with proper validation and callbacks
6. Evaluated the model performance
7. Created visualization tools for predictions
8. Saved the trained model for future use

The model can now predict emotions from speech audio files!