# emotion2vec

emotion2vec was pre-trained on large datasets to learn meaningful acoustic-emotional patterns, creating a universal emotional representation space. The embeddings can be used for:

* Advanced visualization via dimensionality reduction (t-SNE/UMAP)
* Custom emotion classifiers (the basic labels are just a simple classifier on top of these embeddings)
* Cross-corpus transfer learning
* Detecting emotional intensity gradients and mixed emotions



In [None]:
from funasr import AutoModel

# Other emotion2vec checkpoints that can be substituted for `model_id`:
#   "iic/emotion2vec_base"
#   "iic/emotion2vec_base_finetuned"
#   "iic/emotion2vec_plus_seed"
#   "iic/emotion2vec_plus_base"
model_id = "iic/emotion2vec_plus_large"

# hub selects where the checkpoint is fetched from:
#   "ms" / "modelscope"  -> for users in mainland China
#   "hf" / "huggingface" -> for overseas users
model = AutoModel(model=model_id, hub="hf")

In [None]:
import librosa

# Read the sample clip from disk, asking librosa for 16 kHz audio.
audio_path = "../data/audiotest.wav"
loaded_audio = librosa.load(audio_path, sr=16000)

# librosa.load returns the samples together with the sampling rate used.
waveform, sample_rate = loaded_audio


## Output
The emotion labels (angry, sad, etc.) are just the final classification layer's output. 

The more informative output is the emotion embeddings: high-dimensional vectors (1024 dimensions in this case) that:

* Capture emotional intensity gradients
* Represent mixed emotions better than single labels
* Allow cross-corpus transfer learning
* Enable visualization of emotional speech in embedding space


The emotion2vec model achieves state-of-the-art performance on the IEMOCAP dataset using only linear classification layers on top of these embeddings. This suggests that the embeddings have good discriminative power for detecting emotional voice characteristics.

In [None]:
# Run the model once over the whole clip and ask for the embedding as well.
generate_options = {
    "output_dir": "./outputs",  # presumably where FunASR saves its results -- verify
    "granularity": "utterance",
    "extract_embedding": True,
}
rec_result = model.generate(waveform, **generate_options)

# The first (and only) result entry carries the embedding under 'feats'.
first_result = rec_result[0]
rec_embeddings = first_result['feats']
print(f"Embedding dimensionality: {rec_embeddings.shape}")

# Tracking emotional tone changes in a sample audio segment

In [None]:

import librosa
import numpy as np
import matplotlib.pyplot as plt
from funasr import AutoModel
import pandas as pd

# Sliding-window emotion tracking: score every overlapping window of the clip
# with the model, collect the per-class scores, and plot them against time.

# Define parameters for segmentation
segment_length = 3  # seconds
hop_length = 1      # seconds (for overlapping windows)

# Same window/hop sizes expressed in samples
segment_samples = int(segment_length * sample_rate)
hop_samples = int(hop_length * sample_rate)

emotions_over_time = []
timestamps = []
labels = None  # filled in from the first analysed window

for start_idx in range(0, len(waveform) - segment_samples + 1, hop_samples):
    # Extract segment
    segment = waveform[start_idx:start_idx + segment_samples]

    # Feed the samples to the model directly, the same way the whole-clip
    # call passes `waveform`. This avoids writing a scratch WAV file per
    # window (which was also left behind in the working directory).
    # NOTE(review): assumes `waveform` is already at the sampling rate the
    # model expects (it is loaded at 16 kHz earlier in the notebook) -- confirm.
    result = model.generate(segment, extract_embedding=False)

    # Get emotion scores
    scores = result[0]['scores']
    labels = result[0]['labels']

    # Record the window's start time (in seconds) alongside its scores
    timestamp = start_idx / sample_rate
    timestamps.append(timestamp)
    emotions_over_time.append(scores)

# Without this check a too-short clip fails later with an unrelated NameError.
if not emotions_over_time:
    raise ValueError(
        f"Audio is shorter than one {segment_length}s segment "
        f"({len(waveform) / sample_rate:.2f}s); nothing to analyse."
    )

# Convert to DataFrame for easier plotting: one row per window, one column per label
df = pd.DataFrame(emotions_over_time, index=timestamps)
# Labels that contain a '/' keep only the part after it (the English name)
df.columns = [label.split('/')[1] if '/' in label else label for label in labels]

# Plot emotion changes over time
plt.figure(figsize=(15, 6))
for emotion in df.columns:
    plt.plot(df.index, df[emotion], label=emotion)

plt.xlabel('Time (seconds)')
plt.ylabel('Emotion Probability')
plt.title('Emotion Changes Over Time')
plt.legend()
plt.grid(True)
plt.show()

# Compressing the embeddings into a 2D view of the emotion signal over time

In [None]:
# Emotional trajectory: embed every overlapping window of the clip, project
# the embeddings to 2D with UMAP, and draw them as a time-ordered path.
import numpy as np
from umap import UMAP

embeddings_over_time = []

for start_idx in range(0, len(waveform) - segment_samples + 1, hop_samples):
    # Extract segment
    segment = waveform[start_idx:start_idx + segment_samples]

    # Feed the samples to the model directly, the same way the whole-clip
    # call passes `waveform`, instead of writing a scratch WAV file per window.
    # Granularity is requested explicitly (as in the whole-clip call) because
    # the stacking below needs exactly one vector per window.
    # NOTE(review): assumes `waveform` is already at the sampling rate the
    # model expects (it is loaded at 16 kHz earlier in the notebook) -- confirm.
    result = model.generate(
        segment,
        granularity="utterance",
        extract_embedding=True,
    )

    # The embedding is returned under the 'feats' key
    timestamp = start_idx / sample_rate
    embeddings_over_time.append({
        'timestamp': timestamp,
        'embedding': result[0]['feats'],
    })

# Fail with a clear message rather than letting UMAP choke on an empty array.
if not embeddings_over_time:
    raise ValueError(
        "Audio is shorter than one analysis segment; no embeddings to visualise."
    )

embeddings = np.array([e['embedding'] for e in embeddings_over_time])
timestamps = [e['timestamp'] for e in embeddings_over_time]

# Reduce to 2D for visualization (fixed seed so the layout is reproducible)
# NOTE(review): n_neighbors=15 may exceed the number of windows for short
# clips -- check how UMAP handles that for your data.
reducer = UMAP(random_state=42, n_neighbors=15, min_dist=0.1, n_components=2)
embedding_2d = reducer.fit_transform(embeddings)

# Scatter the windows coloured by start time, then join them in order with a
# faint line so the path through embedding space is visible.
plt.figure(figsize=(10, 8))
plt.scatter(embedding_2d[:, 0], embedding_2d[:, 1], c=timestamps, cmap='viridis', alpha=0.7)
plt.plot(embedding_2d[:, 0], embedding_2d[:, 1], 'k-', alpha=0.3)
plt.colorbar(label='Time (seconds)')
plt.title('Emotional Trajectory Over Time')
plt.show()