In [1]:
import sys
sys.path.append("..")

In [2]:
%load_ext autoreload
%autoreload 2

In [1]:
# from similarity_functions import *
import os
import dotenv
from sklearn.metrics.pairwise import cosine_similarity
import librosa
import numpy


In [23]:
librosa.__version__

'0.10.0.post2'

In [24]:
!python --version

Python 3.10.10


In [25]:
numpy.__version__

'1.24.4'

In [26]:
# Load the environment from the .env file.
dotenv.load_dotenv(dotenv.find_dotenv())

# Directory where mp3 are stored.
AUDIO_DIR = os.environ.get('AUDIO_DIR')
AUDIO_DIR

'./data/fma_small/'

In [27]:
!pip list | grep librosa

librosa                       0.8.1


In [28]:
!pip list | grep numpy

numpy                         1.24.3


# Similarity Functions

In [29]:
track_id1 = "002"
track_id2 = "255"
filename1 = os.path.join('.'+ AUDIO_DIR, "000", "000" + track_id1 + '.mp3')
filename2 =  os.path.join('.'+ AUDIO_DIR, "000", "000" + track_id2 + '.mp3')
print('files are: ', filename1,' and ', filename2)

audio1, sr1 = load_and_convert_to_mono(filename1)
audio2, sr2 = load_and_convert_to_mono(filename2)
assert sr2==sr2

get_resampling_similarity_from_frame(audio1, audio2), get_resampling_similarity_from_track_id(track_id1, track_id2)

files are:  ../data/fma_small/000/000002.mp3  and  ../data/fma_small/000/000255.mp3


(0.021866493, 0.021866493)

In [30]:
track_id1 = "002"
track_id2 = "005"
filename1 = os.path.join('.'+ AUDIO_DIR, "000", "000" + track_id1 + '.mp3')
filename2 =  os.path.join('.'+ AUDIO_DIR, "000", "000" + track_id2 + '.mp3')
print('files are: ', filename1,' and ', filename2)

audio1, sr1 = load_and_convert_to_mono(filename1)
audio2, sr2 = load_and_convert_to_mono(filename2)
assert sr2==sr2

get_resampling_similarity_from_frame(audio1, audio2),  get_resampling_similarity_from_track_id(track_id1, track_id2), get_resampling_similarity_from_track_id(track_id1, track_id2)

files are:  ../data/fma_small/000/000002.mp3  and  ../data/fma_small/000/000005.mp3


(0.01560968, 0.01560968, 0.01560968)

In [31]:
get_similarity_within_same_song_by_resampling(track_id1), get_similarity_within_same_song_by_resampling(track_id2)

(0.18930985, 0.0668598)

# Features

In [36]:
from features import *
from utils import *
import pandas as pd

In [37]:
feature_sizes = dict(chroma_stft=12, chroma_cqt=12, chroma_cens=12,
                     tonnetz=6, mfcc=20, rms=1, zcr=1,
                     spectral_centroid=1, spectral_bandwidth=1,
                     spectral_contrast=7, spectral_rolloff=1)
moments = ('mean', 'std', 'skew', 'kurtosis', 'median', 'min', 'max')

columns = []
for name, size in feature_sizes.items():
    for moment in moments:
        it = ((name, moment, '{:02d}'.format(i+1)) for i in range(size))
        columns.extend(it)

names = ('feature', 'statistics', 'number')
columns = pd.MultiIndex.from_tuples(columns, names=names)

# More efficient to slice if indexes are sorted.
columns.sort_values()

MultiIndex([('chroma_cens', 'kurtosis', '01'),
            ('chroma_cens', 'kurtosis', '02'),
            ('chroma_cens', 'kurtosis', '03'),
            ('chroma_cens', 'kurtosis', '04'),
            ('chroma_cens', 'kurtosis', '05'),
            ('chroma_cens', 'kurtosis', '06'),
            ('chroma_cens', 'kurtosis', '07'),
            ('chroma_cens', 'kurtosis', '08'),
            ('chroma_cens', 'kurtosis', '09'),
            ('chroma_cens', 'kurtosis', '10'),
            ...
            (    'tonnetz',      'std', '04'),
            (    'tonnetz',      'std', '05'),
            (    'tonnetz',      'std', '06'),
            (        'zcr', 'kurtosis', '01'),
            (        'zcr',      'max', '01'),
            (        'zcr',     'mean', '01'),
            (        'zcr',   'median', '01'),
            (        'zcr',      'min', '01'),
            (        'zcr',     'skew', '01'),
            (        'zcr',      'std', '01')],
           names=['feature', 'statistics', 

In [38]:
tid = "002"
out = compute_features(tid)

start
filepath =  ../data/fma_small/000/000002.mp3


In [39]:
out

feature      statistics  number
chroma_cens  kurtosis    01       -0.217359
                         02       -0.726509
                         03       -0.491446
                         04       -0.721771
                         05       -0.545604
                                     ...   
zcr          mean        01        0.098364
             median      01        0.078613
             min         01        0.006348
             skew        01        1.825834
             std         01        0.068405
Name: 002, Length: 518, dtype: float64

In [40]:
type(out)

pandas.core.series.Series

In [41]:
out.values[:10]

array([-0.21735859, -0.72650862, -0.49144626, -0.72177076, -0.54560375,
        0.90352058,  0.78698349,  0.91715646, -0.79314399,  0.45264697])

In [42]:
out.index

MultiIndex([('chroma_cens', 'kurtosis', '01'),
            ('chroma_cens', 'kurtosis', '02'),
            ('chroma_cens', 'kurtosis', '03'),
            ('chroma_cens', 'kurtosis', '04'),
            ('chroma_cens', 'kurtosis', '05'),
            ('chroma_cens', 'kurtosis', '06'),
            ('chroma_cens', 'kurtosis', '07'),
            ('chroma_cens', 'kurtosis', '08'),
            ('chroma_cens', 'kurtosis', '09'),
            ('chroma_cens', 'kurtosis', '10'),
            ...
            (    'tonnetz',      'std', '04'),
            (    'tonnetz',      'std', '05'),
            (    'tonnetz',      'std', '06'),
            (        'zcr', 'kurtosis', '01'),
            (        'zcr',      'max', '01'),
            (        'zcr',     'mean', '01'),
            (        'zcr',   'median', '01'),
            (        'zcr',      'min', '01'),
            (        'zcr',     'skew', '01'),
            (        'zcr',      'std', '01')],
           names=['feature', 'statistics', 

## zero-crossing rate

In [43]:
def load_and_convert_to_mono_from_track_id(track_id: str) -> tuple:

    filename = os.path.join('.'+ AUDIO_DIR, "000", "000" + track_id + '.mp3')

    audio, sr = load_and_convert_to_mono(filename)

    return audio, sr


def resample_feature(feature: np.ndarray, target_length: int) -> np.ndarray:
    """
    Resamples a feature to a target length using linear interpolation.

    Parameters:
        feature (numpy array): Input feature to resample.
        target_length (int): Target length for resampling.

    Returns:
        resampled_feature (numpy array): Resampled feature.
    """
    return librosa.resample(feature, len(feature), target_length)

In [44]:
def get_zero_crossing_rate_feature_from_track_id(track_id: str) -> float:

    """
    Computes the zero-crossing rate feature of an audio file.

    Parameters:
        audio_file (str): Path to the audio file.

    Returns:
        zero_crossing_rate (numpy array): Zero-crossing rate feature.
    """

    audio, sr = load_and_convert_to_mono_from_track_id(track_id)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(audio)

    return zero_crossing_rate[0]

In [45]:
tid1 = "002"
tid2 = "005"
res1 = get_zero_crossing_rate_feature_from_track_id(tid1)
res2 = get_zero_crossing_rate_feature_from_track_id(tid2)
res1.shape, res2.shape

((2582,), (2585,))

In [46]:
def compute_zero_crossing_rate(audio_file: str) -> np.ndarray:
    """
    Computes the zero-crossing rate feature of an audio file.

    Parameters:
        audio_file (str): Path to the audio file.

    Returns:
        zero_crossing_rate (numpy array): Zero-crossing rate feature.
    """
    y, sr = librosa.load(audio_file)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)[0]
    return zero_crossing_rate


def compute_zero_crossing_rate_from_track_id(tid: str) -> np.ndarray:
    """
    Computes the zero-crossing rate feature of an audio file.

    Parameters:
        audio_file (str): Path to the audio file.

    Returns:
        zero_crossing_rate (numpy array): Zero-crossing rate feature.
    """

    audio, sr = load_and_convert_to_mono_from_track_id(tid)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(audio)[0]
    return zero_crossing_rate

def resample_feature(feature: np.ndarray, target_length: int) -> np.ndarray:
    """
    Resamples a feature to a target length using linear interpolation.

    Parameters:
        feature (numpy array): Input feature to resample.
        target_length (int): Target length for resampling.

    Returns:
        resampled_feature (numpy array): Resampled feature.
    """
    return librosa.resample(feature, len(feature), target_length)

def normalize_feature(feature: np.ndarray) -> np.ndarray:
    """
    Normalizes a feature to have zero mean and unit variance.

    Parameters:
        feature (numpy array): Input feature to normalize.

    Returns:
        normalized_feature (numpy array): Normalized feature.
    """
    return (feature - np.mean(feature)) / np.std(feature)

def calculate_similarity(feature1: np.ndarray, feature2: np.ndarray, Euclidean=True) -> float:
    """
    Calculates the similarity score between two features.

    Parameters:
        feature1 (numpy array): First input feature.
        feature2 (numpy array): Second input feature.

    Returns:
        similarity_score (float): Similarity score between the two features.
    """
    # Resample features to a common length
    target_length = max(len(feature1), len(feature2))
    feature1_resampled = resample_feature(feature1, target_length)
    feature2_resampled = resample_feature(feature2, target_length)

    # Normalize the resampled features
    feature1_normalized = normalize_feature(feature1_resampled)
    feature2_normalized = normalize_feature(feature2_resampled)

    if Euclidean:
        # Compute the Euclidean distance between the features
        distance = np.linalg.norm(feature1_normalized - feature2_normalized)
    
        # Calculate similarity score (lower distance implies higher similarity)
        similarity_score = 1 / (1 + distance)

    return similarity_score

    


In [51]:
tid1 = "002"
tid2 = "005"
res1 = compute_zero_crossing_rate_from_track_id(tid1)
res2 = compute_zero_crossing_rate_from_track_id(tid2)
res1.shape, res2.shape, 
# calculate_similarity(res1, res2)

((2582,), (2585,))

In [48]:
librosa.__version__

'0.10.0.post2'

In [49]:
import sys
print(sys.version)

3.11.4 (main, Jul  5 2023, 08:41:25) [Clang 14.0.6 ]


In [50]:
librosa.__version__
!python --version

Python 3.10.10


In [54]:
import numpy as np
import librosa
from librosa.feature import delta

def normalize_feature(feature: np.ndarray) -> np.ndarray:
    """
    Normalize the feature to have zero mean and unit variance.

    Parameters:
        feature (numpy array): Input feature.

    Returns:
        normalized_feature (numpy array): Normalized feature.
    """
    return (feature - np.mean(feature)) / np.std(feature)

def compute_similarity_with_dtw(feature1: np.ndarray, feature2: np.ndarray) -> float:
    """
    Compute similarity between two audio features using dynamic time warping.

    Parameters:
        feature1 (numpy array): First input feature.
        feature2 (numpy array): Second input feature.

    Returns:
        similarity (float): Similarity score between the two features.
    """
    # Normalize the features
    feature1_norm = normalize_feature(feature1)
    feature2_norm = normalize_feature(feature2)

    # Compute delta features to capture temporal changes
    feature1_delta = delta(feature1_norm)
    feature2_delta = delta(feature2_norm)

    # Compute the DTW distance
    dtw_distance, _ = librosa.sequence.dtw(X=feature1_delta, Y=feature2_delta)

    # Invert the distance to get similarity
    similarity = 1.0 / (1.0 + dtw_distance)

    return similarity


In [57]:
tid1 = "002"
tid2 = "005"
res1 = compute_zero_crossing_rate_from_track_id(tid1)
res2 = compute_zero_crossing_rate_from_track_id(tid2)
res1.shape, res2.shape, 
# calculate_similarity(res1, res2)

((2582,), (2585,))

In [None]:
feature1_delta = delta(feature1_norm)
feature2_delta = delta(feature2_norm)

In [56]:
compute_similarity_with_dtw(res1, res2)

array([[0.66493028, 0.49804911, 0.39812861, ..., 0.00070293, 0.00070268,
        0.00070243],
       [0.49804911, 0.49804911, 0.39812861, ..., 0.00070293, 0.00070268,
        0.00070243],
       [0.39812861, 0.39812861, 0.39812861, ..., 0.00070293, 0.00070268,
        0.00070243],
       ...,
       [0.0029391 , 0.0029391 , 0.0029391 , ..., 0.00596088, 0.00595776,
        0.00595464],
       [0.00293743, 0.00293743, 0.00293743, ..., 0.00595736, 0.00595424,
        0.00595113],
       [0.00293576, 0.00293576, 0.00293576, ..., 0.00595384, 0.00595073,
        0.00594761]])