In [1]:
!pip install numpy librosa scipy pesq pystoi

Collecting pesq
  Downloading pesq-0.0.4.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pystoi
  Downloading pystoi-0.4.1-py2.py3-none-any.whl.metadata (4.0 kB)
Downloading pystoi-0.4.1-py2.py3-none-any.whl (8.2 kB)
Building wheels for collected packages: pesq
  Building wheel for pesq (setup.py) ... [?25l[?25hdone
  Created wheel for pesq: filename=pesq-0.0.4-cp311-cp311-linux_x86_64.whl size=275944 sha256=fe296d0487a0a06529ee035488c83c362569e8e8fc4bca84da6db27fffa43303
  Stored in directory: /root/.cache/pip/wheels/ae/f1/23/2698d0bf31eec2b2aa50623b5d93b6206c49c7155d0e31345d
Successfully built pesq
Installing collected packages: pesq, pystoi
Successfully installed pesq-0.0.4 pystoi-0.4.1


In [14]:
import numpy as np
import librosa
import scipy.stats
from pesq import pesq
from pystoi import stoi
import json
import csv
import pandas as pd
from datetime import datetime

def evaluate_tts_outputs(reference_path, generated_paths, target_sr=16000):
    """
    Evaluate multiple TTS outputs against a reference audio using various metrics.

    Parameters:
    reference_path (str): Path to reference audio file
    generated_paths (dict): Dictionary of model names and paths to their generated audio files
    target_sr (int): Target sample rate - must be either 8000 or 16000 for PESQ

    Returns:
    dict: Dictionary containing evaluation metrics for each model
    """
    if target_sr not in [8000, 16000]:
        raise ValueError("Sample rate must be either 8000 or 16000 Hz for PESQ evaluation")

    # Load and resample reference audio
    ref_audio, orig_sr = librosa.load(reference_path, sr=None)
    if orig_sr != target_sr:
        ref_audio = librosa.resample(ref_audio, orig_sr=orig_sr, target_sr=target_sr)

    results = {}

    for model_name, audio_path in generated_paths.items():
        # Load and resample generated audio
        gen_audio, gen_sr = librosa.load(audio_path, sr=None)
        if gen_sr != target_sr:
            gen_audio = librosa.resample(gen_audio, orig_sr=gen_sr, target_sr=target_sr)

        # Ensure both audios have the same length
        min_len = min(len(ref_audio), len(gen_audio))
        ref_audio_trim = ref_audio[:min_len]
        gen_audio_trim = gen_audio[:min_len]

        metrics = {}

        # 1. PESQ (Perceptual Evaluation of Speech Quality)
        try:
            # Use 'nb' mode for 8000 Hz, 'wb' mode for 16000 Hz
            mode = 'nb' if target_sr == 8000 else 'wb'
            metrics['pesq'] = pesq(target_sr, ref_audio_trim, gen_audio_trim, mode)
        except Exception as e:
            print(f"PESQ calculation failed for {model_name}: {str(e)}")
            metrics['pesq'] = None

        # 2. STOI (Short-Time Objective Intelligibility)
        try:
            metrics['stoi'] = stoi(ref_audio_trim, gen_audio_trim, target_sr)
        except Exception as e:
            print(f"STOI calculation failed for {model_name}: {str(e)}")
            metrics['stoi'] = None

        # 3. Mel Cepstral Distortion (MCD)
        n_mels = 80
        ref_mel = librosa.feature.melspectrogram(y=ref_audio_trim, sr=target_sr, n_mels=n_mels)
        gen_mel = librosa.feature.melspectrogram(y=gen_audio_trim, sr=target_sr, n_mels=n_mels)
        metrics['mcd'] = np.mean(np.sqrt(np.sum((ref_mel - gen_mel) ** 2, axis=0)))

        # 4. Pitch Correlation
        ref_pitch, _ = librosa.piptrack(y=ref_audio_trim, sr=target_sr)
        gen_pitch, _ = librosa.piptrack(y=gen_audio_trim, sr=target_sr)
        ref_pitch_mean = np.mean(ref_pitch, axis=0)
        gen_pitch_mean = np.mean(gen_pitch, axis=0)
        metrics['pitch_corr'] = scipy.stats.pearsonr(ref_pitch_mean, gen_pitch_mean)[0]

        # 5. Spectral Convergence
        ref_stft = np.abs(librosa.stft(ref_audio_trim))
        gen_stft = np.abs(librosa.stft(gen_audio_trim))
        metrics['spec_conv'] = np.linalg.norm(ref_stft - gen_stft, 'fro') / np.linalg.norm(ref_stft, 'fro')

        # 6. Energy Ratio
        ref_energy = np.sum(ref_audio_trim ** 2)
        gen_energy = np.sum(gen_audio_trim ** 2)
        metrics['energy_ratio'] = gen_energy / ref_energy

        # 7. SNR (Signal-to-Noise Ratio)
        noise = gen_audio_trim - ref_audio_trim
        metrics['snr'] = 10 * np.log10(np.sum(ref_audio_trim ** 2) / np.sum(noise ** 2))

        results[model_name] = metrics

    return results

def format_results(results):
    """
    Format the evaluation results into a readable table.
    """
    headers = ['Model', 'PESQ', 'STOI', 'MCD', 'Pitch Corr', 'Spec Conv', 'Energy Ratio', 'SNR (dB)']
    rows = []

    for model, metrics in results.items():
        row = [
            model,
            f"{metrics['pesq']:.3f}" if metrics['pesq'] is not None else "N/A",
            f"{metrics['stoi']:.3f}" if metrics['stoi'] is not None else "N/A",
            f"{metrics['mcd']:.3f}",
            f"{metrics['pitch_corr']:.3f}",
            f"{metrics['spec_conv']:.3f}",
            f"{metrics['energy_ratio']:.3f}",
            f"{metrics['snr']:.3f}"
        ]
        rows.append(row)

    return headers, rows

class NumpyEncoder(json.JSONEncoder):
    """Custom encoder for numpy data types"""
    def default(self, obj):
        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
                          np.int16, np.int32, np.int64, np.uint8,
                          np.uint16, np.uint32, np.uint64)):
            return int(obj)
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

def save_results(results, output_dir="./evaluation_results"):
    """
    Save evaluation results in multiple formats.

    Parameters:
    results (dict): Dictionary containing evaluation metrics
    output_dir (str): Directory to save the results
    """
    import os
    os.makedirs(output_dir, exist_ok=True)

    # Generate timestamp for unique filenames
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # 1. Save as JSON
    json_path = os.path.join(output_dir, f"tts_evaluation_{timestamp}.json")
    with open(json_path, 'w') as f:
        json.dump(results, f, indent=4, cls=NumpyEncoder)

    # 2. Save as CSV
    csv_path = os.path.join(output_dir, f"tts_evaluation_{timestamp}.csv")
    headers, rows = format_results(results)
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(rows)

    # 3. Save as Excel
    excel_path = os.path.join(output_dir, f"tts_evaluation_{timestamp}.xlsx")
    df = pd.DataFrame(rows, columns=headers)
    df.to_excel(excel_path, index=False)

    print(f"\nResults saved successfully:")
    print(f"JSON: {json_path}")
    print(f"CSV: {csv_path}")
    print(f"Excel: {excel_path}")

    return {
        'json_path': json_path,
        'csv_path': csv_path,
        'excel_path': excel_path
    }

In [15]:
# Example usage
reference_path = "/content/reference.wav"
generated_paths = {
    "OpenVoice": "/content/OpenVoice.wav",
    "CoquiTTS": "/content/CoquiTTS.wav",
    "F5-TTS": "/content/F5-TTS.wav",
    "E2-TTS": "/content/E2-TTS.wav"
}



In [16]:
# Run evaluation
results = evaluate_tts_outputs(reference_path, generated_paths, target_sr=16000)

# Save results (will now work with NumPy data types)
save_paths = save_results(results, output_dir="/content/evaluation_results")

# Display results in notebook
headers, rows = format_results(results)
df = pd.DataFrame(rows, columns=headers)
display(df)


Results saved successfully:
JSON: /content/evaluation_results/tts_evaluation_20250212_070759.json
CSV: /content/evaluation_results/tts_evaluation_20250212_070759.csv
Excel: /content/evaluation_results/tts_evaluation_20250212_070759.xlsx


Unnamed: 0,Model,PESQ,STOI,MCD,Pitch Corr,Spec Conv,Energy Ratio,SNR (dB)
0,OpenVoice,1.165,0.136,37.988,-0.027,3.475,12.305,-11.193
1,CoquiTTS,1.727,0.143,203.193,0.012,6.675,45.896,-16.717
2,F5-TTS,1.782,0.171,174.265,0.06,6.082,39.209,-16.065
3,E2-TTS,2.281,0.165,158.578,-0.051,5.76,34.939,-15.551
