# Emotion Recognition and Classification in Speech

## Dataset Preparation and Loading

In [None]:
import os
import pandas as pd

def create_ravdess_map_df(data_dir="Audio_Speech_Actors_01-24", output_csv="ravdess_metadata.csv"):
    """Build a file-path -> emotion table for the RAVDESS speech recordings.

    Walks ``data_dir`` recursively, reads the emotion code from the third
    dash-separated field of every ``.wav`` file name, writes the table to
    ``output_csv`` and returns it. ``.wav`` files whose name does not carry a
    known emotion code are reported and skipped instead of aborting the scan.

    Args:
        data_dir: Root directory that is searched for ``.wav`` files.
        output_csv: Path of the CSV copy of the table.

    Returns:
        DataFrame with the columns ``file_path`` and ``emotion``, ordered by
        directory and file name.
    """
    # Built once instead of once per file.
    emotion_map = {1: "neutral", 2: "calm", 3: "happy", 4: "sad", 5: "angry", 6: "fearful", 7: "disgust", 8: "surprised"}

    file_paths = []
    emotions = []

    for root, dirs, files in os.walk(data_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # resulting row order (and any seeded split built on it) is reproducible.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".wav"):
                continue

            file_path = os.path.join(root, file)
            try:
                emotion = emotion_map[int(file.split('-')[2])]
            except (IndexError, ValueError, KeyError):
                print(f"Skipping {file_path}: file name has no known emotion code")
                continue

            file_paths.append(file_path)
            emotions.append(emotion)

    df = pd.DataFrame({"file_path": file_paths, "emotion": emotions})
    df.to_csv(output_csv, index=False)
    print(f"Dataset prepared with {len(df)} audio files.")
    return df

In [None]:
# Scan the RAVDESS folder; this also writes ravdess_metadata.csv as a side effect.
ravdess_df = create_ravdess_map_df()

In [None]:
from faster_whisper import WhisperModel
import pandas as pd
import json
import os
import torch
# Module-level device string ("cuda" or "cpu"); detect_emotion reads this global
# when it moves its input tensors.
device="cuda" if torch.cuda.is_available() else "cpu"


def add_transcripts_to_ravdess(df, output_dir="transcripts", model_size="distil-large-v3"):
    """Attach a Whisper transcript to every row of a RAVDESS metadata table.

    Transcripts are cached as ``<output_dir>/<audio file stem>.json`` holding
    ``{"text": ...}``. A cached transcript is reused; otherwise the audio file
    is transcribed and the result is written to the cache. The Whisper model
    is only loaded when at least one file actually needs transcription.

    Args:
        df: DataFrame with the columns ``file_path`` and ``emotion``.
        output_dir: Directory of the JSON transcript cache (created if missing).
        model_size: Model identifier passed to ``WhisperModel``.

    Returns:
        DataFrame with the columns ``file_path``, ``emotion`` and
        ``transcript``. A file whose transcription fails gets an empty
        transcript (best effort; the error is printed).
    """
    os.makedirs(output_dir, exist_ok=True)

    model = None  # loaded lazily on the first cache miss
    transcripts = []

    for file_path, emotion in zip(df["file_path"], df["emotion"]):
        # splitext keeps dots inside the stem, so "a.1.wav" and "a.2.wav"
        # do not share one cache file.
        stem = os.path.splitext(os.path.basename(file_path))[0]
        output_file = os.path.join(output_dir, f"{stem}.json")

        transcript_text = None
        if os.path.exists(output_file):
            try:
                with open(output_file, 'r', encoding="utf-8") as f:
                    transcript_text = json.load(f)["text"]
                print(f"Transcript exists for {file_path}")
            except (ValueError, KeyError, TypeError):
                # A truncated or malformed cache file (e.g. from an interrupted
                # run) is treated as a cache miss instead of aborting.
                print(f"Cached transcript {output_file} is unreadable; re-transcribing")
                transcript_text = None

        if transcript_text is None:
            print(f"Processing {file_path}...")

            if model is None:
                use_cuda = torch.cuda.is_available()
                model = WhisperModel(
                    model_size,
                    device="cuda" if use_cuda else "cpu",
                    compute_type="float16" if use_cuda else "float32",
                )

            try:
                segments, _ = model.transcribe(file_path, beam_size=5, language="en", condition_on_previous_text=False)
                transcript_text = " ".join([segment.text for segment in segments])
                with open(output_file, 'w', encoding="utf-8") as f:
                    json.dump({"text": transcript_text}, f)
            except Exception as e:
                # Deliberately broad: one unreadable audio file must not stop
                # the whole batch.
                print(f"Error processing {file_path}: {str(e)}")
                transcript_text = ""

        transcripts.append({
            "file_path": file_path,
            "emotion": emotion,
            "transcript": transcript_text
        })

    # Explicit columns keep the schema stable even when df has no rows.
    return pd.DataFrame(transcripts, columns=["file_path", "emotion", "transcript"])

In [None]:
transcripts_df = add_transcripts_to_ravdess(ravdess_df)

### Extracting Audio Features

In [None]:
import librosa
import numpy as np
import torch

def extract_audio_features(file_path, max_length=128000, sr=16000, n_mfcc=13, n_mels=128):
    """Load an audio file and compute standardized MFCC and log-Mel features.

    The waveform is loaded at ``sr`` Hz and forced to exactly ``max_length``
    samples (truncated, or zero-padded at the end). Both feature matrices are
    standardized with their own global mean and standard deviation.

    Returns:
        dict with the keys ``'mfccs'`` and ``'log_mel_spec'``.
    """
    def _standardize(matrix):
        # Zero mean / unit variance over the whole matrix; the epsilon guards
        # against a zero standard deviation.
        return (matrix - np.mean(matrix)) / (np.std(matrix) + 1e-8)

    waveform, _ = librosa.load(file_path, sr=sr)

    n_samples = len(waveform)
    if n_samples > max_length:
        waveform = waveform[:max_length]
    else:
        waveform = np.pad(waveform, (0, max_length - n_samples), 'constant')

    mfcc_matrix = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=n_mfcc)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel_power)

    return {
        'mfccs': _standardize(mfcc_matrix),
        'log_mel_spec': _standardize(mel_db),
    }

def prepare_dataset_features(df):
    """Compute audio features for every row of the transcript table.

    Each returned record carries the row's ``file_path``, ``emotion`` and
    ``transcript`` together with the ``mfccs`` and ``log_mel_spec`` arrays
    produced by ``extract_audio_features``.
    """
    def _record(row):
        path = row["file_path"]
        label = row["emotion"]
        text = row["transcript"]
        extracted = extract_audio_features(path)
        return {
            "file_path": path,
            "emotion": label,
            "transcript": text,
            "mfccs": extracted["mfccs"],
            "log_mel_spec": extracted["log_mel_spec"],
        }

    return [_record(row) for _, row in df.iterrows()]

In [None]:
from transformers import BertTokenizer, BertModel

# Loaded BERT tokenizer/model pairs, keyed by model name, so repeated calls
# (e.g. one per inference request) do not reload the weights every time.
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return a cached ``(tokenizer, model)`` pair, loading it on first use."""
    if model_name not in _BERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)
        model.eval()
        _BERT_CACHE[model_name] = (tokenizer, model)
    return _BERT_CACHE[model_name]


def extract_text_features(transcripts, max_length=64):
    """Extract one BERT [CLS] embedding per transcript.

    Args:
        transcripts: Iterable of transcript strings.
        max_length: Token length every transcript is padded/truncated to.

    Returns:
        NumPy array with one row per transcript (the hidden state of the first
        token of ``bert-base-uncased``). For empty input an empty array is
        returned without loading the model.
    """
    transcripts = list(transcripts)
    if not transcripts:
        return np.array([])

    tokenizer, model = _load_bert()

    text_features = []
    for transcript in transcripts:
        inputs = tokenizer(transcript, return_tensors="pt", max_length=max_length,
                           padding="max_length", truncation=True)

        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 is the [CLS] token; the batch holds a single transcript.
        text_embedding = outputs.last_hidden_state[:, 0, :].numpy()
        text_features.append(text_embedding[0])

    return np.array(text_features)




In [None]:
# Extract features
# (one record per audio file: metadata plus the MFCC and log-Mel arrays)
dataset_features = prepare_dataset_features(transcripts_df)

# Save features
# NOTE: pickle files should only ever be loaded back from a trusted source.
import pickle
with open('ravdess_features.pkl', 'wb') as f:
    pickle.dump(dataset_features, f)

# Extract text features
# (same row order as transcripts_df, so index i matches dataset_features[i])
transcripts = transcripts_df["transcript"].tolist()
text_features = extract_text_features(transcripts)

# Save text features
np.save('ravdess_text_features.npy', text_features)

### Collating final dataset

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

class EmotionDataset(Dataset):
    """Dataset for multimodal emotion recognition.

    Pairs the log-Mel spectrogram of each audio record with the text embedding
    at the same index and an integer-encoded emotion label.

    Args:
        features: Sequence of dicts holding at least ``"emotion"`` and
            ``"log_mel_spec"``.
        text_features: Indexable collection of text embeddings, aligned with
            ``features`` by position.
        transform: Optional callable applied to the audio tensor.

    Raises:
        ValueError: If ``features`` and ``text_features`` differ in length.
    """
    def __init__(self, features, text_features, transform=None):
        # The two modalities are joined purely by index, so a length mismatch
        # would silently pair the wrong samples or fail mid-epoch.
        if len(features) != len(text_features):
            raise ValueError(
                f"features ({len(features)}) and text_features "
                f"({len(text_features)}) must have the same length"
            )

        self.features = features
        self.text_features = text_features
        self.transform = transform

        self.emotions = [item["emotion"] for item in features]
        self.label_encoder = LabelEncoder()
        self.encoded_emotions = self.label_encoder.fit_transform(self.emotions)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        """Return a dict with the ``"audio"``, ``"text"`` and ``"emotion"`` tensors of sample ``idx``."""
        audio_feature = self.features[idx]["log_mel_spec"]
        text_feature = self.text_features[idx]
        emotion = self.encoded_emotions[idx]

        # unsqueeze(0) adds the leading channel dimension.
        audio_tensor = torch.FloatTensor(audio_feature).unsqueeze(0)
        text_tensor = torch.FloatTensor(text_feature)
        emotion_tensor = torch.tensor(emotion, dtype=torch.long)

        # Explicit None check: a callable with a falsy truth value (e.g. an
        # empty nn.Sequential) must still be applied.
        if self.transform is not None:
            audio_tensor = self.transform(audio_tensor)

        return {
            "audio": audio_tensor,
            "text": text_tensor,
            "emotion": emotion_tensor
        }

In [None]:
# Pair the audio records with the text embeddings (matched by index).
emotion_dataset = EmotionDataset(dataset_features, text_features)

# 80/20 train/validation split over sample indices, stratified by emotion so
# both parts keep the class proportions; the fixed seed makes it repeatable.
from sklearn.model_selection import train_test_split
train_indices, val_indices = train_test_split(
    list(range(len(emotion_dataset))), 
    test_size=0.2, 
    random_state=42,
    stratify=[item["emotion"] for item in dataset_features]
)

In [None]:
from torch.utils.data import Subset

# Index-based views onto the single underlying dataset (no data is copied).
train_dataset = Subset(emotion_dataset, train_indices)
val_dataset = Subset(emotion_dataset, val_indices)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

## Creating Model Architecture

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AudioEncoder(nn.Module):
    """CNN that maps a spectrogram batch to a fixed-size embedding.

    Three conv/ReLU/max-pool stages (16, 32, 64 channels) are followed by an
    adaptive average pool to 4x4, so the linear layer always receives
    64 * 4 * 4 values regardless of the input's spatial size.
    """
    def __init__(self, input_channels=1, hidden_dim=64):
        super().__init__()

        stages = []
        in_ch = input_channels
        for out_ch in (16, 32, 64):
            stages += [
                nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
            in_ch = out_ch
        stages.append(nn.AdaptiveAvgPool2d((4, 4)))

        # Same module order as before, so state_dict keys are unchanged.
        self.conv_layers = nn.Sequential(*stages)
        self.fc = nn.Linear(64 * 4 * 4, hidden_dim)

    def forward(self, x):
        pooled = self.conv_layers(x)
        flat = pooled.view(pooled.size(0), -1)
        return F.relu(self.fc(flat))


class TextEncoder(nn.Module):
    """Two-layer MLP that projects a text embedding to ``hidden_dim`` features."""
    def __init__(self, input_dim=768, hidden_dim=64):
        super().__init__()

        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, hidden_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        # Dropout sits between the two ReLU-activated linear layers.
        hidden = self.dropout(F.relu(self.fc1(x)))
        return F.relu(self.fc2(hidden))

class EmotionClassifier(nn.Module):
    """Late-fusion emotion classifier over audio and text inputs.

    Each modality is encoded separately; the two embeddings are concatenated
    along the feature dimension and passed through a small MLP that outputs
    one logit per emotion.
    """
    def __init__(self, num_emotions=8, audio_hidden_dim=64, text_hidden_dim=64, fusion_dim=128):
        super().__init__()

        self.audio_encoder = AudioEncoder(hidden_dim=audio_hidden_dim)
        self.text_encoder = TextEncoder(hidden_dim=text_hidden_dim)

        joint_dim = audio_hidden_dim + text_hidden_dim
        self.fusion = nn.Sequential(
            nn.Linear(joint_dim, fusion_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(fusion_dim, num_emotions),
        )

    def forward(self, audio, text):
        audio_embedding = self.audio_encoder(audio)
        text_embedding = self.text_encoder(text)
        joint = torch.cat([audio_embedding, text_embedding], dim=1)
        return self.fusion(joint)

## Training the model

In [None]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def train_model(model, train_loader, val_loader, dataset, num_epochs=100, learning_rate=0.001):
    """Train the emotion classification model for a fixed number of epochs.

    Every epoch runs one optimisation pass over ``train_loader`` and one
    evaluation pass over ``val_loader``. Whenever validation accuracy improves,
    the weights are written to ``best_emotion_model.pt``. After training, the
    loss/accuracy curves are saved to ``training_history.png`` and a validation
    confusion matrix to ``confusion_matrix.png``.

    Args:
        model: Network called as ``model(audio, text)`` returning class logits.
        train_loader: Iterable of batches, each a dict with ``"audio"``,
            ``"text"`` and ``"emotion"`` tensors.
        val_loader: Validation batches in the same format.
        dataset: Object exposing ``label_encoder.classes_`` (axis labels of the
            confusion matrix).
        num_epochs: Number of epochs to run.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The model carrying the weights of the last epoch (the best weights are
        the ones saved to disk).
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    def run_epoch(loader, training):
        """Run one pass over ``loader``; return (mean loss, true labels, predictions)."""
        model.train(training)
        total_loss = 0.0
        preds = []
        targets = []

        with torch.set_grad_enabled(training):
            for batch in loader:
                audio = batch["audio"].to(device)
                text = batch["text"].to(device)
                emotions = batch["emotion"].to(device)

                outputs = model(audio, text)
                loss = criterion(outputs, emotions)

                if training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                total_loss += loss.item()
                preds.extend(outputs.argmax(dim=1).cpu().numpy())
                targets.extend(emotions.cpu().numpy())

        return total_loss / len(loader), targets, preds

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    best_val_acc = 0.0

    for epoch in range(num_epochs):
        train_loss, train_true, train_preds = run_epoch(train_loader, training=True)
        train_acc = accuracy_score(train_true, train_preds)
        train_f1 = f1_score(train_true, train_preds, average='weighted')

        val_loss, val_true, val_preds = run_epoch(val_loader, training=False)
        val_acc = accuracy_score(val_true, val_preds)
        val_f1 = f1_score(val_true, val_preds, average='weighted')

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), "best_emotion_model.pt")
            print("Saved best model!")

    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(train_accuracies, label='Train Accuracy')
    plt.plot(val_accuracies, label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.savefig("training_history.png")

    # Final evaluation of the last-epoch weights for the confusion matrix.
    _, all_true, all_preds = run_epoch(val_loader, training=False)

    emotion_labels = dataset.label_encoder.classes_
    # Passing every encoded class id keeps the matrix aligned with the tick
    # labels even when a class never occurs in the validation data.
    cm = confusion_matrix(all_true, all_preds, labels=np.arange(len(emotion_labels)))

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=emotion_labels, yticklabels=emotion_labels)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix.png")

    return model

In [None]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch
import torch.nn as nn
import os

def save_checkpoint(model, optimizer, epoch, best_val_acc, train_losses, val_losses,
                    train_accuracies, val_accuracies, no_improvement_counter, filename,
                    checkpoint_dir="checkpoints"):
    """Save a training checkpoint to ``<checkpoint_dir>/<filename>``.

    The checkpoint holds the model and optimizer state dicts plus the training
    history needed to resume or to rank checkpoints later.

    Args:
        model: Model whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch index recorded in the checkpoint.
        best_val_acc: Best validation accuracy reached so far.
        train_losses, val_losses, train_accuracies, val_accuracies: Per-epoch
            history sequences.
        no_improvement_counter: Current early-stopping counter.
        filename: File name of the checkpoint inside ``checkpoint_dir``.
        checkpoint_dir: Target directory; created if it does not exist.
    """
    # torch.save does not create missing directories.
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Metrics are stored as plain Python floats (copied lists) so the file does
    # not contain pickled NumPy scalars, which restricted loaders may reject.
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_val_acc': float(best_val_acc),
        'train_losses': [float(v) for v in train_losses],
        'val_losses': [float(v) for v in val_losses],
        'train_accuracies': [float(v) for v in train_accuracies],
        'val_accuracies': [float(v) for v in val_accuracies],
        'no_improvement_counter': no_improvement_counter
    }
    torch.save(checkpoint, os.path.join(checkpoint_dir, filename))
    print(f"\nCheckpoint saved as {filename}")


def train_model_es(model, train_loader, val_loader, dataset, num_epochs=100, learning_rate=0.001, 
                   patience=5, delta=0.001, max_checkpoints=3):
    """Train the emotion classification model with early stopping and checkpoint management.

    Every epoch runs one optimisation pass over ``train_loader`` and one
    evaluation pass over ``val_loader``. When validation accuracy beats the
    best value so far by more than ``delta``, a checkpoint is saved under
    ``checkpoints/`` and only the ``max_checkpoints`` best ones (by accuracy,
    then by recency) are kept on disk. Training stops once ``patience``
    consecutive epochs bring no such improvement. Afterwards the history plot
    (``training_history.png``) and a validation confusion matrix
    (``confusion_matrix.png``) are written.

    Args:
        model: Network called as ``model(audio, text)`` returning class logits.
        train_loader: Iterable of batches, each a dict with ``"audio"``,
            ``"text"`` and ``"emotion"`` tensors.
        val_loader: Validation batches in the same format.
        dataset: Object exposing ``label_encoder.classes_`` (axis labels of the
            confusion matrix).
        num_epochs: Upper bound on the number of epochs.
        learning_rate: Learning rate of the Adam optimizer.
        patience: Epochs without improvement tolerated before stopping.
        delta: Minimum accuracy gain that counts as an improvement.
        max_checkpoints: Number of checkpoint files kept on disk.

    Returns:
        The model carrying the weights of the last epoch that ran (the best
        weights are in the saved checkpoints).
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Checkpoints are written below this directory; make sure it exists before
    # the first save.
    os.makedirs("checkpoints", exist_ok=True)

    def run_epoch(loader, training):
        """Run one pass over ``loader``; return (mean loss, true labels, predictions)."""
        model.train(training)
        total_loss = 0.0
        preds = []
        targets = []

        with torch.set_grad_enabled(training):
            for batch in loader:
                audio = batch["audio"].to(device)
                text = batch["text"].to(device)
                emotions = batch["emotion"].to(device)

                outputs = model(audio, text)
                loss = criterion(outputs, emotions)

                if training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                total_loss += loss.item()
                preds.extend(outputs.argmax(dim=1).cpu().numpy())
                targets.extend(emotions.cpu().numpy())

        return total_loss / len(loader), targets, preds

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    best_val_acc = 0.0
    no_improvement_counter = 0
    saved_checkpoints = []  # (val_acc, epoch number, file path), best first

    for epoch in range(num_epochs):
        train_loss, train_true, train_preds = run_epoch(train_loader, training=True)
        train_acc = accuracy_score(train_true, train_preds)
        train_f1 = f1_score(train_true, train_preds, average='weighted')

        val_loss, val_true, val_preds = run_epoch(val_loader, training=False)
        val_acc = accuracy_score(val_true, val_preds)
        val_f1 = f1_score(val_true, val_preds, average='weighted')

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")

        if val_acc > (best_val_acc + delta):
            print(f"Validation accuracy improved from {best_val_acc:.4f} to {val_acc:.4f}")
            best_val_acc = val_acc
            no_improvement_counter = 0

            checkpoint_name = f"model_{val_acc:.4f}_epoch{epoch+1}_checkpoint.pt"
            save_checkpoint(
                model=model,
                optimizer=optimizer,
                epoch=epoch,
                best_val_acc=best_val_acc,
                train_losses=train_losses,
                val_losses=val_losses,
                train_accuracies=train_accuracies,
                val_accuracies=val_accuracies,
                no_improvement_counter=no_improvement_counter,
                filename=checkpoint_name
            )

            checkpoint_path = os.path.join("checkpoints", checkpoint_name)
            saved_checkpoints.append((val_acc, epoch+1, checkpoint_path))
            # Highest accuracy first, newer epoch first on ties; the entry that
            # ends up last is the one to drop.
            saved_checkpoints.sort(key=lambda x: (-x[0], -x[1]))

            while len(saved_checkpoints) > max_checkpoints:
                removed_acc, removed_epoch, removed_file = saved_checkpoints.pop()
                try:
                    os.remove(removed_file)
                    print(f"Removed old checkpoint: {removed_file}")
                except FileNotFoundError:
                    print(f"Warning: Could not find file {removed_file} to delete")
        else:
            no_improvement_counter += 1
            print(f"No improvement for {no_improvement_counter}/{patience} epochs")
            if no_improvement_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}!")
                break

    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(train_accuracies, label='Train Accuracy')
    plt.plot(val_accuracies, label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.savefig("training_history.png")

    # Final evaluation of the last-epoch weights for the confusion matrix.
    _, all_true, all_preds = run_epoch(val_loader, training=False)

    emotion_labels = dataset.label_encoder.classes_
    # Passing every encoded class id keeps the matrix aligned with the tick
    # labels even when a class never occurs in the validation data.
    cm = confusion_matrix(all_true, all_preds, labels=np.arange(len(emotion_labels)))

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=emotion_labels, yticklabels=emotion_labels)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix.png")

    return model

In [None]:
# Size the classifier output from the label encoder so it matches the classes in the data.
num_emotions = len(emotion_dataset.label_encoder.classes_)
model = EmotionClassifier(num_emotions=num_emotions)

# Up to 400 epochs; stop after 40 epochs without a validation-accuracy gain of
# more than 0.01, keeping at most 2 checkpoint files on disk.
trained_model = train_model_es(model, train_loader, val_loader, emotion_dataset, num_epochs=400, learning_rate=0.001, patience=40, delta=0.01, max_checkpoints=2)

## Model inference

In [None]:
def load_checkpoint(model, optimizer, filename):
    """Restore model (and optionally optimizer) state from ``checkpoints/<filename>``.

    Args:
        model: Model that receives the stored ``model_state_dict``.
        optimizer: Optimizer that receives the stored ``optimizer_state_dict``;
            pass ``None`` to restore the model only (e.g. for inference).
        filename: Checkpoint file name inside the ``checkpoints`` directory.

    Returns:
        dict with the stored training history: ``epoch``, ``best_val_acc``,
        ``train_losses``, ``val_losses``, ``train_accuracies``,
        ``val_accuracies`` and ``no_improvement_counter``.
    """
    path = os.path.join("checkpoints", filename)
    # Load onto the CPU first so a checkpoint written on a GPU machine also
    # opens on a CPU-only one; load_state_dict copies the values into the
    # existing parameters on whatever device they live on.
    checkpoint = torch.load(path, map_location="cpu")
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    history_keys = (
        'epoch',
        'best_val_acc',
        'train_losses',
        'val_losses',
        'train_accuracies',
        'val_accuracies',
        'no_improvement_counter',
    )
    return {key: checkpoint[key] for key in history_keys}

In [None]:
import glob
def load_best_model(model_class=EmotionClassifier, checkpoint_dir='checkpoints', device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Load the best saved checkpoint into a fresh model, ready for inference.

    Checkpoints are ranked the same way training ranks them when pruning old
    files: highest ``best_val_acc`` first, the later epoch winning ties. That
    value describes the weights stored in the file, whereas the minimum of the
    stored loss history can stem from an epoch whose weights were never saved.

    Args:
        model_class: Class instantiated without arguments to hold the weights.
        checkpoint_dir: Directory searched for ``model_*.pt`` files.
        device: Device the checkpoint and the model are placed on.

    Returns:
        The model with the best checkpoint's weights, switched to eval mode.

    Raises:
        ValueError: If ``checkpoint_dir`` contains no matching checkpoint.
    """
    checkpoints = glob.glob(os.path.join(checkpoint_dir, 'model_*.pt'))
    if not checkpoints:
        raise ValueError("No checkpoints found")

    best_score = None
    best_checkpoint = None
    name = None

    # Sorted so the outcome does not depend on the file system's listing order.
    for ckpt in sorted(checkpoints):
        data = torch.load(ckpt, map_location=device)
        score = (data['best_val_acc'], data['epoch'])
        if best_score is None or score > best_score:
            best_score = score
            best_checkpoint = data
            name = ckpt

    model = model_class().to(device)
    model.load_state_dict(best_checkpoint['model_state_dict'])
    # A freshly built module is in training mode; without eval() dropout would
    # stay active and make predictions random.
    model.eval()
    print(f"Loaded model: {name}")
    return model

In [None]:
# Rebinds the global `model` (read by detect_emotion) to the weights of the selected checkpoint.
model = load_best_model(EmotionClassifier)

In [None]:
# Whisper models already loaded by detect_emotion, keyed by compute type, so
# the weights are not reloaded for every analysed file.
_WHISPER_CACHE = {}


def detect_emotion(audio_path):
    """Transcribe an audio file and predict its emotion with the global model.

    Uses the module-level ``model`` (classifier) and ``emotion_dataset`` (for
    the class names), prints a short report and returns the result.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        dict with ``"emotion"`` (top label), ``"confidence"`` (its
        probability), ``"transcript"`` and ``"probabilities"`` (list of
        ``(label, probability)`` pairs, most likely first).
    """
    compute_type = "float16" if torch.cuda.is_available() else "float32"
    whisper_model = _WHISPER_CACHE.get(compute_type)
    if whisper_model is None:
        whisper_model = WhisperModel("distil-large-v3", compute_type=compute_type)
        _WHISPER_CACHE[compute_type] = whisper_model

    segments, _ = whisper_model.transcribe(audio_path, beam_size=5, language="en", condition_on_previous_text=False)
    transcript = " ".join([segment.text for segment in segments])

    # Put the inputs on the device the classifier actually lives on instead of
    # relying on a separately maintained global.
    model_device = next(model.parameters()).device

    audio_features = extract_audio_features(audio_path)
    # Two unsqueezes add the batch and channel dimensions.
    audio_tensor = torch.FloatTensor(audio_features["log_mel_spec"]).unsqueeze(0).unsqueeze(0).to(model_device)
    text_features = extract_text_features([transcript])[0]
    text_tensor = torch.FloatTensor(text_features).unsqueeze(0).to(model_device)

    # Inference must run in eval mode, otherwise dropout randomises the output.
    model.eval()
    with torch.no_grad():
        outputs = model(audio_tensor, text_tensor)
        probs = torch.softmax(outputs, dim=1)
        conf, _ = torch.max(probs, 1)

    probs_list = [(label, prob.item())
                  for label, prob in zip(emotion_dataset.label_encoder.classes_, probs.squeeze())]
    sorted_probs = sorted(probs_list, key=lambda x: x[1], reverse=True)

    print(f"\n{' EMOTION PREDICTION RESULTS ':=^40}")
    print(f"Audio File: {os.path.basename(audio_path)}")
    print(f"\nTranscript: {transcript}\n")
    print(f"{'Top Predictions':-^30}")
    for i, (emotion, prob) in enumerate(sorted_probs[:3], 1):
        print(f"{i}. {emotion.upper():<12} {prob:.2%}")
    print(f"{'':-^30}")
    print(f"Predicted Emotion: {sorted_probs[0][0].upper()} (confidence: {conf.item():.2%})")
    print(f"{'':=^40}\n")

    return {
        "emotion": sorted_probs[0][0],
        "confidence": conf.item(),
        "transcript": transcript,
        "probabilities": sorted_probs
    }

In [None]:
# Smoke test on one RAVDESS recording (third file-name field 01 = "neutral").
detect_emotion('Audio_Speech_Actors_01-24/Actor_05/03-01-01-01-01-02-05.wav')

## Displaying with a simple UI

In [None]:
import gradio as gr
import torch
import os

# Rebinds the global `device` as a torch.device object (it was a plain string earlier in the notebook).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def gradio_detect_emotion(audio_path):
    """Gradio callback: analyse an audio file and return one value per output component.

    Args:
        audio_path: File path supplied by the ``gr.Audio`` input, or ``None``
            when the input was cleared.

    Returns:
        Tuple ``(transcript, top emotion text, probability DataFrame, report
        text)`` in the order of the components wired up below. The DataFrame
        has the columns ``Emotion`` and ``Probability`` expected by the bar
        plot.
    """
    import pandas as pd  # local import: this cell does not import pandas itself

    # The change event also fires when the audio input is cleared.
    if audio_path is None:
        return "", "", pd.DataFrame({"Emotion": [], "Probability": []}), ""

    result = detect_emotion(audio_path)

    top_emotion = f"{result['emotion'].upper()} ({result['confidence']:.2%})"
    prob_df = pd.DataFrame({
        "Emotion": [label.upper() for label, _ in result['probabilities']],
        "Probability": [prob for _, prob in result['probabilities']],
    })

    output_text = f"""
{'-'*40}
Audio File: {os.path.basename(audio_path)}
{'-'*40}
Transcript: {result['transcript']}
{'-'*40}
Top Predictions:
""" + "\n".join([f"{i}. {emotion.upper():<12} {prob:.2%}" 
               for i, (emotion, prob) in enumerate(result['probabilities'][:3], 1)]) + f"""
{'-'*40}
Final Prediction: {top_emotion}
{'-'*40}"""

    return result['transcript'], top_emotion, prob_df, output_text

with gr.Blocks(theme=gr.themes.Soft(), title="Speech Emotion Recognition") as demo:
    gr.Markdown("# 🎤 Real-Time Speech Emotion Analysis")
    gr.Markdown("Upload an audio file or record using your microphone")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Input Audio",
            waveform_options=gr.WaveformOptions(waveform_color="#FF6B6B")
        )

    with gr.Row():
        text_output = gr.Textbox(label="Transcript", interactive=False)
        emotion_output = gr.Textbox(label="Predicted Emotion", interactive=False)

    plot_output = gr.BarPlot(
        label="Emotion Probabilities",
        x="Emotion",
        y="Probability",
        color="Emotion",
        height=300
    )

    console_output = gr.Textbox(label="Analysis Details", interactive=False)

    # `outputs` must be a flat list of components; the callback returns one
    # value per component, in the same order.
    audio_input.change(
        fn=gradio_detect_emotion,
        inputs=audio_input,
        outputs=[text_output, emotion_output, plot_output, console_output]
    )

demo.launch()

In [None]:
def gradio_detect_emotion(audio_path):
    """Gradio callback: analyse an audio file and return one value per output component.

    Args:
        audio_path: File path supplied by the ``gr.Audio`` input, or ``None``
            when the input was cleared.

    Returns:
        Tuple ``(transcript, top emotion text, probability DataFrame, report
        text)``. The DataFrame has the columns ``Emotion`` and ``Probability``.
        For a cleared input all four values are empty.
    """
    # The change event also fires when the audio input is cleared; there is
    # nothing to analyse then, so reset every output instead of failing.
    if audio_path is None:
        return "", "", pd.DataFrame({"Emotion": [], "Probability": []}), ""

    result = detect_emotion(audio_path)

    df = pd.DataFrame({
        "Emotion": [e[0].upper() for e in result['probabilities']],
        "Probability": [e[1] for e in result['probabilities']]
    })

    top_emotion = f"{result['emotion'].upper()} ({result['confidence']:.2%})"
    output_text = f"""
Transcript: {result['transcript']}
Predicted Emotion: {top_emotion}
Top 3 Probabilities:
""" + "\n".join([f"{i}. {e[0].upper():<12} {e[1]:.2%}" 
               for i, e in enumerate(result['probabilities'][:3], 1)])

    return result['transcript'], top_emotion, df, output_text

# Build the UI: one audio input, two text fields, a bar plot and a details box.
with gr.Blocks(title="Speech Emotion Recognition") as demo:
    gr.Markdown("# 🎤 Real-Time Speech Emotion Analysis")
    
    with gr.Row():
        # type="filepath" hands the callback a path string rather than raw samples.
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Input Audio"
        )
        
    with gr.Row():
        text_output = gr.Textbox(label="Transcript")
        emotion_output = gr.Textbox(label="Predicted Emotion")
        
    # x/y name the DataFrame columns returned by gradio_detect_emotion.
    plot_output = gr.BarPlot(
        label="Emotion Probabilities",
        x="Emotion",
        y="Probability",
        title="Emotion Distribution",
        height=300
    )
    
    console_output = gr.Textbox(label="Analysis Details")
    
    # The callback's four return values map onto these four components in order.
    audio_input.change(
        fn=gradio_detect_emotion,
        inputs=audio_input,
        outputs=[text_output, emotion_output, plot_output, console_output]
    )

demo.launch()
# demo.launch(share=True)