# Audio Similarity Evaluation Notebook

This notebook evaluates the similarity between two audio files using modularized metrics for:

1. **Harmony / Tonality** (Key, Chroma, Chord Sequences)
2. **Rhythm / Meter** (Tempo, Beat, Downbeat)
3. **Structural Form** (Segmentation, Boundaries)
4. **Non-target Stems** (Source Separation, Interference)
5. **Melodic Content / Motifs** (Melody Extraction, Motif Similarity)

Each metric is implemented as a separate function for clarity and reusability.

In [4]:
# Import required libraries for all metrics
import numpy as np
import librosa
import librosa.display
import essentia.standard as ess
import madmom
import mir_eval
import msaf
import museval
import soundfile as sf
from scipy.spatial.distance import cdist
from scipy.spatial.distance import cosine

ImportError: cannot import name 'MutableSequence' from 'collections' (/home/xunyijiang/miniconda3/envs/audio/lib/python3.11/collections/__init__.py)

In [3]:
# !pip install mir_eval
# !pip install msaf
# !pip install museval
# !pip install madmom

In [None]:
# Load two audio files as input
def load_audio(file_path, sr=22050):
    y, sr = librosa.load(file_path, sr=sr, mono=True)
    return y, sr

# Example usage:
audio_path_1 = 'audio1.wav'  # Replace with your file path
audio_path_2 = 'audio2.wav'  # Replace with your file path
y1, sr1 = load_audio(audio_path_1)
y2, sr2 = load_audio(audio_path_2)

In [None]:
# 1. Harmony / Tonality Metrics
def extract_key_scale(y, sr):
    audio = ess.MonoLoader(filename=None, sampleRate=sr)(y) if isinstance(y, str) else y
    key_extractor = ess.KeyExtractor()
    key, scale, strength = key_extractor(audio)
    return key, scale

def key_scale_similarity(key1, scale1, key2, scale2):
    # Simple equality or circle-of-fifths distance (placeholder)
    return int(key1 == key2 and scale1 == scale2)

def chroma_similarity(y1, sr1, y2, sr2):
    chroma1 = librosa.feature.chroma_cqt(y=y1, sr=sr1)
    chroma2 = librosa.feature.chroma_cqt(y=y2, sr=sr2)
    # Mean chroma correlation
    sim = np.corrcoef(np.mean(chroma1, axis=1), np.mean(chroma2, axis=1))[0,1]
    return sim

def chroma_dtw(y1, sr1, y2, sr2):
    chroma1 = librosa.feature.chroma_cqt(y=y1, sr=sr1)
    chroma2 = librosa.feature.chroma_cqt(y=y2, sr=sr2)
    D, wp = librosa.sequence.dtw(chroma1, chroma2, metric='cosine')
    return D[-1, -1]

def extract_chords_madmom(audio_path):
    proc = madmom.features.chords.CNNChordRecognitionProcessor()
    chords = proc(audio_path)
    return chords

def chord_eval_mir_eval(ref_chords, est_chords):
    ref_intervals, ref_labels = ref_chords[:, :2], ref_chords[:, 2]
    est_intervals, est_labels = est_chords[:, :2], est_chords[:, 2]
    scores = mir_eval.chord.evaluate(ref_intervals, ref_labels, est_intervals, est_labels)
    return scores

In [7]:
import numpy as np
from madmom.features.chords import CNNChordFeatureProcessor, CRFChordRecognitionProcessor
print("madmom imported successfully")

AttributeError: module 'numpy' has no attribute 'float'.
`np.float` was a deprecated alias for the builtin `float`. To avoid this error in existing code, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [8]:
# Fix for madmom compatibility with Python 3.10+
import sys
try:
    # Try to patch the collections module for madmom compatibility
    import collections
    if not hasattr(collections, 'MutableSequence'):
        import collections.abc
        collections.MutableSequence = collections.abc.MutableSequence
        collections.Iterable = collections.abc.Iterable
        collections.Mapping = collections.abc.Mapping
        collections.MutableMapping = collections.abc.MutableMapping
        collections.Sequence = collections.abc.Sequence
        print("Applied Python 3.10+ compatibility patch for madmom")
except Exception as e:
    print(f"Could not apply compatibility patch: {e}")

# Now try to import madmom
try:
    import numpy as np
    from madmom.features.chords import CNNChordFeatureProcessor, CRFChordRecognitionProcessor
    print("madmom imported successfully")
except Exception as e:
    print(f"madmom import failed: {e}")

madmom import failed: module 'numpy' has no attribute 'float'.
`np.float` was a deprecated alias for the builtin `float`. To avoid this error in existing code, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [None]:
# 2. Rhythm / Meter Metrics
def extract_tempo_beats(y, sr):
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr, units='time')
    return tempo, beats

def tempo_difference(tempo1, tempo2):
    return abs(tempo1 - tempo2)

def beat_f_measure(ref_beats, est_beats):
    scores = mir_eval.beat.evaluate(ref_beats, est_beats)
    return scores['F-measure']

def extract_downbeats_madmom(audio_path):
    proc = madmom.features.downbeats.RNNDownBeatProcessor()
    act = proc(audio_path)
    dbn = madmom.features.downbeats.DBNDownBeatTrackingProcessor(beats_per_bar=[3,4], fps=100)
    downbeats = dbn(act)
    return downbeats

def downbeat_alignment(ref_downbeats, est_downbeats, tolerance=0.07):
    # Precision/recall within tolerance window
    ref_set = set(np.round(ref_downbeats/tolerance))
    est_set = set(np.round(est_downbeats/tolerance))
    precision = len(ref_set & est_set) / len(est_set) if est_set else 0
    recall = len(ref_set & est_set) / len(ref_set) if ref_set else 0
    return precision, recall

In [None]:
# 3. Structural Form Metrics
def segment_audio_msaf(audio_path, feature='pcp', algo='sf'):
    boundaries, labels = msaf.segment(audio_path, feature=feature, algo=algo)
    return boundaries, labels

def segment_audio_librosa(y, sr):
    # Novelty-based segmentation (simple example)
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    boundaries = librosa.segment.agglomerative(onset_env.reshape(1, -1), k=4)
    return boundaries

def boundary_f_measure(ref_boundaries, est_boundaries, window=3):
    precision, recall, f = mir_eval.segment.detection(ref_boundaries, est_boundaries, window=window)
    return f

def pairwise_ari(ref_boundaries, ref_labels, est_boundaries, est_labels):
    ari = mir_eval.segment.pairwise(ref_boundaries, ref_labels, est_boundaries, est_labels)
    return ari

In [None]:
# 4. Non-target Stems (Extraneous Content) Metrics
def bss_eval_sources(ref_sources, est_sources):
    sdr, sir, sar, perm = mir_eval.separation.bss_eval_sources(ref_sources, est_sources)
    return {'SDR': sdr, 'SIR': sir, 'SAR': sar, 'perm': perm}

def museval_bss_eval(ref_sources, est_sources):
    # museval wrapper (requires multi-channel input)
    import museval
    scores = museval.evaluate(ref_sources, est_sources)
    return scores

def instrument_tag_mismatch(ref_tags, est_tags):
    # Simple mismatch rate between instrument tags (placeholder)
    mismatches = sum(r != e for r, e in zip(ref_tags, est_tags))
    return mismatches / max(len(ref_tags), 1)

In [None]:
# 5. Melodic Content / Motifs Metrics
def extract_melody_librosa(y, sr, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7')):
    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=fmin, fmax=fmax)
    return f0, voiced_flag

def extract_melody_essentia(audio_path):
    melody_extractor = ess.MelodyExtractor()
    f0, pitch_confidence = melody_extractor(audio_path)
    return f0, pitch_confidence

def melody_accuracy(ref_f0, est_f0, sr):
    scores = mir_eval.melody.evaluate(ref_f0, est_f0, sr)
    return scores

def motif_dtw(f0_1, f0_2):
    # DTW distance between two pitch contours (ignoring unvoiced)
    mask1 = ~np.isnan(f0_1)
    mask2 = ~np.isnan(f0_2)
    f0_1_valid = f0_1[mask1]
    f0_2_valid = f0_2[mask2]
    D, wp = librosa.sequence.dtw(f0_1_valid.reshape(1, -1), f0_2_valid.reshape(1, -1), metric='euclidean')
    return D[-1, -1]

## Usage Example
- Load your two audio files above (replace the file paths as needed).
- Call the relevant metric functions for each evaluation aspect.
- Aggregate or display the results as needed for your use case.

Each function is modular and can be used independently or in combination for comprehensive audio similarity evaluation.